Skip to content

Commit

Permalink
update codepoint
Browse files Browse the repository at this point in the history
  • Loading branch information
HenryAWE committed Nov 25, 2023
1 parent 34b1257 commit 55f2382
Show file tree
Hide file tree
Showing 4 changed files with 272 additions and 111 deletions.
56 changes: 43 additions & 13 deletions include/papilio/utf/codepoint.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#ifdef PAPILIO_COMPILER_MSVC
# pragma warning(push)
// Variable 'variable' is uninitialized. Always initialize a member variable (type.6).
# pragma warning(disable:26495)
#endif

Expand Down Expand Up @@ -39,13 +40,11 @@ namespace papilio::utf
class decoder<char16_t>
{
public:
static auto to_char32_t(std::u16string_view ch) -> std::pair<char32_t, std::uint8_t>;
static auto to_char32_t(char16_t first, char16_t second = u'\0') -> std::pair<char32_t, std::uint8_t>;
static constexpr auto to_char32_t(std::u16string_view ch) -> std::pair<char32_t, std::uint8_t>;
static constexpr auto to_char32_t(char16_t first, char16_t second = u'\0') -> std::pair<char32_t, std::uint8_t>;

static std::uint8_t size_bytes(char16_t ch) noexcept;

static auto to_codepoint(std::u16string_view ch) -> std::pair<codepoint, std::uint8_t>;
static auto to_codepoint(char16_t first, char16_t second = u'\0') -> std::pair<codepoint, std::uint8_t>;
static constexpr auto to_codepoint(std::u16string_view ch) -> std::pair<codepoint, std::uint8_t>;
static constexpr auto to_codepoint(char16_t first, char16_t second = u'\0') -> std::pair<codepoint, std::uint8_t>;

struct from_codepoint_result
{
Expand All @@ -59,17 +58,17 @@ namespace papilio::utf
std::uint8_t processed_size = 0;

[[nodiscard]]
std::u16string_view get() const noexcept
constexpr std::u16string_view get() const noexcept
{
return std::u16string_view(chars, size);
}
operator std::u16string_view() const noexcept
constexpr operator std::u16string_view() const noexcept
{
return get();
}
};

static auto from_codepoint(codepoint cp) -> from_codepoint_result;
static constexpr auto from_codepoint(codepoint cp) -> from_codepoint_result;
};
template <>
class decoder<wchar_t>
Expand Down Expand Up @@ -220,7 +219,7 @@ namespace papilio::utf
return std::bit_cast<const char*>(u8data());
}
[[nodiscard]]
constexpr std::uint8_t size() const noexcept
constexpr std::uint8_t size_bytes() const noexcept
{
return byte_count(m_data[0]);
}
Expand All @@ -231,11 +230,11 @@ namespace papilio::utf
}
explicit constexpr operator std::u8string_view() const noexcept
{
return std::u8string_view(m_data, size());
return std::u8string_view(m_data, size_bytes());
}
explicit constexpr operator std::string_view() const noexcept
{
return std::string_view(data(), size());
return std::string_view(data(), size_bytes());
}

template <std::integral To = char8_t>
Expand All @@ -249,7 +248,7 @@ namespace papilio::utf
static_cast<To>(m_data[3])
};

return std::make_pair(arr, size());
return std::make_pair(arr, size_bytes());
}

constexpr bool operator==(codepoint rhs) const noexcept
Expand Down Expand Up @@ -289,6 +288,37 @@ namespace papilio::utf
friend std::basic_ostream<char16_t>& operator<<(std::basic_ostream<char16_t>& os, codepoint cp);
friend std::basic_ostream<char32_t>& operator<<(std::basic_ostream<char32_t>& os, codepoint cp);

// Estimates the display width of this code point.
//
// Returns 2 when the UTF-32 value of the code point falls inside one of the
// half-open ranges in the table below, and 1 for every other value.
// NOTE(review): the ranges look like East Asian wide / emoji blocks - confirm
// against the Unicode East Asian Width data before extending the table.
constexpr std::size_t estimate_width() const
{
    // Each entry is a half-open range: first is included, second is excluded.
    constexpr std::pair<char32_t, char32_t> wide_ranges[] =
    {
        { 0x1100u, 0x1160u },
        { 0x2329u, 0x232Bu },
        { 0x2E80u, 0x303Fu },
        { 0x3040u, 0xA4D0u },
        { 0xAC00u, 0xD7A4u },
        { 0xF900u, 0xFB00u },
        { 0xFE10u, 0xFE1Au },
        { 0xFE30u, 0xFE70u },
        { 0xFF00u, 0xFF61u },
        { 0xFFE0u, 0xFFE7u },
        { 0x1F300u, 0x1F650u },
        { 0x1F900u, 0x1FA00u },
        { 0x20000u, 0x2FFFEu },
        { 0x30000u, 0x3FFFEu }
    };

    const char32_t value = static_cast<char32_t>(*this);
    for(const auto& [range_begin, range_end] : wide_ranges)
    {
        const bool inside = value >= range_begin && value < range_end;
        if(inside)
            return 2;
    }

    return 1;
}

private:
char8_t m_data[4];
};
Expand Down
71 changes: 69 additions & 2 deletions include/papilio/utf/codepoint.inl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ namespace papilio::utf
return 4;
}

constexpr auto decoder<char32_t>::to_codepoint(char32_t ch)->std::pair<codepoint, std::uint8_t>
constexpr auto decoder<char32_t>::to_codepoint(char32_t ch) -> std::pair<codepoint, std::uint8_t>
{
std::uint8_t len = size_bytes(ch);
char8_t bytes[4] = { 0, 0, 0, 0 };
Expand Down Expand Up @@ -49,7 +49,7 @@ namespace papilio::utf
PAPILIO_UNREACHABLE();
}

return std::make_pair(codepoint(bytes, len), len);
return std::make_pair(codepoint(bytes, len), 1);
}

constexpr auto decoder<char32_t>::from_codepoint(codepoint cp) noexcept -> std::pair<char32_t, std::uint8_t>
Expand Down Expand Up @@ -113,4 +113,71 @@ namespace papilio::utf
std::uint8_t len = size_bytes(ch[0]);
return std::make_pair(codepoint(ch.data(), len), len);
}

// Decodes the character at the front of `ch` from UTF-16 into a UTF-32 value.
//
// Returns { decoded value, number of char16_t units consumed }:
//  - empty input gives { U'\0', 0 };
//  - a leading unit that is not a high surrogate gives { that unit, 1 };
//  - a high surrogate followed by a low surrogate gives { combined value, 2 }.
//
// Throws invalid_surrogate carrying the leading unit when the input ends right
// after a high surrogate, or carrying the second unit when that unit is not a
// low surrogate.
constexpr auto decoder<char16_t>::to_char32_t(std::u16string_view ch) -> std::pair<char32_t, std::uint8_t>
{
    if(ch.empty()) [[unlikely]]
        return std::make_pair(U'\0', 0);

    const char16_t lead = ch[0];
    if(!is_high_surrogate(lead)) [[likely]]
    {
        char32_t result = lead;
        return std::make_pair(result, 1);
    }

    // From here on `lead` is a high surrogate and needs a trailing low surrogate.
    if(ch.size() < 2) [[unlikely]]
        throw invalid_surrogate(lead);

    const char16_t trail = ch[1];
    if(!is_low_surrogate(trail)) [[unlikely]]
        throw invalid_surrogate(trail);

    // Parentheses make the precedence explicit: subtract first, then shift.
    char32_t result =
        ((lead - 0xD800) << 10) +
        (trail - 0xDC00) +
        0x10000;
    return std::make_pair(result, 2);
}
// Decodes one character given as up to two separate UTF-16 code units.
//
// Both units are placed in a two-element buffer and handed to the
// std::u16string_view overload, so the result and the exceptions are exactly
// those of that overload for a view of length 2.
constexpr auto decoder<char16_t>::to_char32_t(char16_t first, char16_t second) -> std::pair<char32_t, std::uint8_t>
{
    const char16_t units[2] = { first, second };
    const std::u16string_view view(units, 2);

    return to_char32_t(view);
}

// Converts the character at the front of `ch` into a codepoint object.
//
// The UTF-16 input is decoded to UTF-32 first; the resulting value is then
// turned into a codepoint by decoder<char32_t>. The second member of the
// returned pair is the number of char16_t units reported by to_char32_t,
// not the count reported by the UTF-32 decoder.
constexpr auto decoder<char16_t>::to_codepoint(std::u16string_view ch) -> std::pair<codepoint, std::uint8_t>
{
    const auto decoded = to_char32_t(ch);
    codepoint cp = decoder<char32_t>::to_codepoint(decoded.first).first;
    return std::make_pair(cp, decoded.second);
}
// Converts one character given as up to two UTF-16 code units into a codepoint.
//
// Returns { codepoint, number of char16_t units consumed }. The count comes
// from to_char32_t (1 for a single unit, 2 for a surrogate pair) rather than
// from decoder<char32_t>::to_codepoint, whose count describes UTF-32 input.
// This keeps the result consistent with the std::u16string_view overload.
// Exceptions thrown by to_char32_t propagate unchanged.
constexpr auto decoder<char16_t>::to_codepoint(char16_t first, char16_t second) -> std::pair<codepoint, std::uint8_t>
{
    auto [ch32, processed_size] = to_char32_t(first, second);
    return std::make_pair(decoder<char32_t>::to_codepoint(ch32).first, processed_size);
}

// Encodes the code point `cp` as UTF-16.
//
// `cp` is first converted to a UTF-32 value through
// decoder<char32_t>::from_codepoint; the second member of that result is
// stored in `processed_size`.
//  - Values up to 0xD7FF, or in [0xE000, 0xFFFF], are written as one unit.
//  - Values in [0x10000, 0x10FFFF] are written as a surrogate pair.
//  - Any other value throws std::invalid_argument.
constexpr auto decoder<char16_t>::from_codepoint(codepoint cp) -> from_codepoint_result
{
    from_codepoint_result result;

    const auto decoded = decoder<char32_t>::from_codepoint(cp);
    const char32_t ch32 = decoded.first;
    result.processed_size = decoded.second;

    const bool fits_single_unit = ch32 <= 0xD7FF || (0xE000 <= ch32 && ch32 <= 0xFFFF);
    if(fits_single_unit)
    {
        result.chars[0] = static_cast<char16_t>(ch32);
        result.size = 1;
        return result;
    }

    const bool needs_surrogate_pair = 0x10000 <= ch32 && ch32 <= 0x10FFFF;
    if(!needs_surrogate_pair)
    {
        throw std::invalid_argument("invalid codepoint"); // TODO: Better exception
    }

    // Split the 20-bit offset above 0x10000 into two 10-bit halves.
    std::uint32_t offset = ch32 - 0x10000;
    result.chars[0] = 0xD800 + (offset >> 10);
    result.chars[1] = 0xDC00 + (offset & 0x3FF);
    result.size = 2;

    return result;
}
}
83 changes: 3 additions & 80 deletions src/utf/codepoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,85 +4,7 @@

namespace papilio::utf
{
// Decodes the character at the front of `ch` from UTF-16 into a UTF-32 value.
//
// Returns { decoded value, number of char16_t units consumed }:
//  - empty input gives { U'\0', 0 };
//  - a leading unit that is not a high surrogate gives { that unit, 1 };
//  - a high surrogate followed by a low surrogate gives { combined value, 2 }.
//
// Throws invalid_surrogate carrying the leading unit when the input ends right
// after a high surrogate, or carrying the second unit when that unit is not a
// low surrogate.
auto decoder<char16_t>::to_char32_t(std::u16string_view ch) -> std::pair<char32_t, std::uint8_t>
{
    if(ch.empty()) [[unlikely]]
        return std::make_pair(U'\0', 0);

    const char16_t lead = ch[0];
    if(!is_high_surrogate(lead)) [[likely]]
    {
        char32_t result = lead;
        return std::make_pair(result, 1);
    }

    // From here on `lead` is a high surrogate and needs a trailing low surrogate.
    if(ch.size() < 2) [[unlikely]]
        throw invalid_surrogate(lead);

    const char16_t trail = ch[1];
    if(!is_low_surrogate(trail)) [[unlikely]]
        throw invalid_surrogate(trail);

    // Parentheses make the precedence explicit: subtract first, then shift.
    char32_t result =
        ((lead - 0xD800) << 10) +
        (trail - 0xDC00) +
        0x10000;
    return std::make_pair(result, 2);
}
// Decodes one character given as up to two separate UTF-16 code units.
//
// Both units are placed in a two-element buffer and handed to the
// std::u16string_view overload, so the result and the exceptions are exactly
// those of that overload for a view of length 2.
auto decoder<char16_t>::to_char32_t(char16_t first, char16_t second) -> std::pair<char32_t, std::uint8_t>
{
    const char16_t units[2] = { first, second };
    const std::u16string_view view(units, 2);

    return to_char32_t(view);
}

// Returns how many char16_t units the character starting with `ch` occupies:
// 2 when `ch` is a high surrogate, 1 otherwise.
std::uint8_t decoder<char16_t>::size_bytes(char16_t ch) noexcept
{
    return is_high_surrogate(ch) ? 2 : 1;
}

// Converts the character at the front of `ch` into a codepoint object.
//
// Returns { codepoint, number of char16_t units consumed }. The count is the
// one reported by to_char32_t (0 for empty input, 1 for a single unit, 2 for
// a surrogate pair); the count returned by decoder<char32_t>::to_codepoint
// describes its own UTF-32 input and is therefore not forwarded.
// Exceptions thrown by to_char32_t propagate unchanged.
auto decoder<char16_t>::to_codepoint(std::u16string_view ch) -> std::pair<codepoint, std::uint8_t>
{
    auto [ch32, processed_size] = to_char32_t(ch);
    return std::make_pair(decoder<char32_t>::to_codepoint(ch32).first, processed_size);
}
// Converts one character given as up to two UTF-16 code units into a codepoint.
//
// Returns { codepoint, number of char16_t units consumed }. The count is the
// one reported by to_char32_t (1 for a single unit, 2 for a surrogate pair);
// the count returned by decoder<char32_t>::to_codepoint describes its own
// UTF-32 input and is therefore not forwarded.
// Exceptions thrown by to_char32_t propagate unchanged.
auto decoder<char16_t>::to_codepoint(char16_t first, char16_t second) -> std::pair<codepoint, std::uint8_t>
{
    auto [ch32, processed_size] = to_char32_t(first, second);
    return std::make_pair(decoder<char32_t>::to_codepoint(ch32).first, processed_size);
}

// Encodes the code point `cp` as UTF-16.
//
// `cp` is first converted to a UTF-32 value through
// decoder<char32_t>::from_codepoint; the second member of that result is
// stored in `processed_size`.
//  - Values up to 0xD7FF, or in [0xE000, 0xFFFF], are written as one unit.
//  - Values in [0x10000, 0x10FFFF] are written as a surrogate pair.
//  - Any other value throws std::invalid_argument.
auto decoder<char16_t>::from_codepoint(codepoint cp) -> from_codepoint_result
{
    from_codepoint_result result;

    const auto decoded = decoder<char32_t>::from_codepoint(cp);
    const char32_t ch32 = decoded.first;
    result.processed_size = decoded.second;

    const bool fits_single_unit = ch32 <= 0xD7FF || (0xE000 <= ch32 && ch32 <= 0xFFFF);
    if(fits_single_unit)
    {
        result.chars[0] = static_cast<char16_t>(ch32);
        result.size = 1;
        return result;
    }

    const bool needs_surrogate_pair = 0x10000 <= ch32 && ch32 <= 0x10FFFF;
    if(!needs_surrogate_pair)
    {
        throw std::invalid_argument("invalid codepoint"); // TODO: Better exception
    }

    // Split the 20-bit offset above 0x10000 into two 10-bit halves.
    std::uint32_t offset = ch32 - 0x10000;
    result.chars[0] = 0xD800 + (offset >> 10);
    result.chars[1] = 0xDC00 + (offset & 0x3FF);
    result.size = 2;

    return result;
}

auto decoder<wchar_t>::to_char32_t(std::wstring_view ch)->std::pair<char32_t, std::uint8_t>
auto decoder<wchar_t>::to_char32_t(std::wstring_view ch) -> std::pair<char32_t, std::uint8_t>
{
if(ch.empty()) [[unlikely]]
return std::make_pair(U'\0', 0);
Expand All @@ -100,7 +22,8 @@ namespace papilio::utf

// Converts the character at the front of the wide string `ch` into a codepoint.
//
// Returns { codepoint, number of wchar_t units consumed }. The count is the
// one reported by this decoder's to_char32_t; the count returned by
// decoder<char32_t>::to_codepoint describes its own UTF-32 input and is
// therefore not forwarded.
auto decoder<wchar_t>::to_codepoint(std::wstring_view ch) -> std::pair<codepoint, std::uint8_t>
{
    auto [ch32, processed_size] = to_char32_t(ch);
    return std::make_pair(decoder<char32_t>::to_codepoint(ch32).first, processed_size);
}

auto decoder<wchar_t>::from_codepoint(codepoint cp) -> from_codepoint_result
Expand Down
Loading

0 comments on commit 55f2382

Please sign in to comment.