diff --git a/src/simple_tokenizer.cc b/src/simple_tokenizer.cc
index 955fe5e..983670e 100644
--- a/src/simple_tokenizer.cc
+++ b/src/simple_tokenizer.cc
@@ -2,11 +2,9 @@
 
 #include
 #include
-#include
-#include
-#include
+#include
+#include
 #include
-#include
 #include
 
 namespace simple_tokenizer {
@@ -22,13 +20,18 @@ PinYin *SimpleTokenizer::get_pinyin() {
 }
 
 static TokenCategory from_char(char c) {
-  if (std::isdigit(c)) {
+  auto uc = static_cast<unsigned char>(c);
+  // ASCII should in 0..127
+  if (uc > 127) {
+    return TokenCategory::OTHER;
+  }
+  if (std::isdigit(uc)) {
     return TokenCategory::DIGIT;
   }
-  if (std::isspace(c) || std::iscntrl(c)) {
+  if (std::isspace(uc) || std::iscntrl(uc)) {
     return TokenCategory::SPACE;
   }
-  if (std::isalpha(c)) {
+  if (std::isalpha(uc)) {
     return TokenCategory::ASCII_ALPHABETIC;
   }
   return TokenCategory::OTHER;