From 3d7dde77ebdcfe6083b0961d6dc637b41a53c0b6 Mon Sep 17 00:00:00 2001 From: Nikita Konstantinov Date: Wed, 7 Nov 2018 20:29:04 +0300 Subject: [PATCH] Improve Lexer's error handling of invalid UTF-8 strings. --- Exception/InternalError.php | 18 ++++++++++++++++++ Llk/Lexer.php | 32 +++++++++++++++++++++++++++++--- Test/Unit/Llk/Lexer.php | 23 +++++++++++++++++++++++ 3 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 Exception/InternalError.php diff --git a/Exception/InternalError.php b/Exception/InternalError.php new file mode 100644 index 00000000..005998c9 --- /dev/null +++ b/Exception/InternalError.php @@ -0,0 +1,18 @@ +validateInputInUnicodeMode($text); + $this->_text = $text; $this->_tokens = $tokens; $this->_nsStack = null; @@ -272,9 +275,9 @@ protected function nextToken($offset) */ protected function matchLexeme($lexeme, $regex, $offset) { - $_regex = str_replace('#', '\#', $regex); - $preg = preg_match( - '#\G(?|' . $_regex . ')#' . $this->_pcreOptions, + $_regex = '#\G(?|' . str_replace('#', '\#', $regex) . ')#' . $this->_pcreOptions; + $preg = @preg_match( + $_regex, $this->_text, $matches, 0, @@ -285,6 +288,16 @@ protected function matchLexeme($lexeme, $regex, $offset) return null; } + if (false === $preg) { + throw new InternalError( + sprintf( + 'Lexer encountered a PCRE error (code: %d), full regex: "%s".', + preg_last_error(), + $_regex + ) + ); + } + if ('' === $matches[0]) { throw new Compiler\Exception\Lexer( 'A lexeme must not match an empty value, which is the ' . @@ -300,4 +313,17 @@ protected function matchLexeme($lexeme, $regex, $offset) 'length' => mb_strlen($matches[0]) ]; } + + /** + * @param string $text + * @return void + */ + private function validateInputInUnicodeMode($text) + { + if (strpos($this->_pcreOptions, 'u') !== false && !mb_check_encoding($text, 'utf-8')) { + throw new Compiler\Exception\Lexer( + 'Text is not valid utf-8 string, you probably need to switch "lexer.unicode" setting off.' 
+ ); + } + } } diff --git a/Test/Unit/Llk/Lexer.php b/Test/Unit/Llk/Lexer.php index 58a93f14..373bc9d6 100644 --- a/Test/Unit/Llk/Lexer.php +++ b/Test/Unit/Llk/Lexer.php @@ -496,4 +496,27 @@ public function case_unicode_disabled() ' ↑' ); } + + public function case_invalid_utf8_with_unicode_mode() + { + $this + ->given( + $lexer = new SUT(['lexer.unicode' => true]), + $datum = "\xFF", + $tokens = [ + 'default' => [ + 'foo' => "\xFF" + ] + ] + ) + ->when($result = $lexer->lexMe($datum, $tokens)) + ->then + ->exception(function () use ($result) { + $result->next(); + }) + ->isInstanceOf(LUT\Exception\Lexer::class) + ->hasMessage( + 'Text is not valid utf-8 string, you probably need to switch "lexer.unicode" setting off.' + ); + } }