Skip to content
This repository has been archived by the owner on Sep 20, 2021. It is now read-only.

Commit

Permalink
Improve Lexer's error handling of invalid UTF-8 strings.
Browse files Browse the repository at this point in the history
  • Loading branch information
unkind committed Nov 13, 2018
1 parent c620f44 commit 05740f1
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 3 deletions.
18 changes: 18 additions & 0 deletions Exception/InternalError.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?php

namespace Hoa\Compiler\Exception;

use LogicException;

/**
 * Signals a failure that most likely originates inside the Hoa Compiler
 * library itself.
 *
 * Whatever the root cause turns out to be, please report this exception to
 * the library maintainers: even when the triggering bug lives in calling
 * code, this exception is never expected to be thrown.
 */
final class InternalError extends LogicException
{
    /**
     * @param string          $message  Description of the internal failure.
     * @param \Exception|null $previous Underlying exception to chain, if any.
     */
    public function __construct($message, \Exception $previous = null)
    {
        // The type hint is fully qualified on purpose: this file lives in a
        // namespace, so an unqualified `Exception` would resolve to a class
        // of that namespace rather than to PHP's built-in exception and
        // would reject ordinary previous exceptions.
        // The error code is always 0; only the message and the chained
        // exception are configurable.
        parent::__construct($message, 0, $previous);
    }
}
32 changes: 29 additions & 3 deletions Llk/Lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
namespace Hoa\Compiler\Llk;

use Hoa\Compiler;
use Hoa\Compiler\Exception\InternalError;

/**
* Class \Hoa\Compiler\Llk\Lexer.
Expand Down Expand Up @@ -110,6 +111,8 @@ public function __construct(array $pragmas = [])
*/
public function lexMe($text, array $tokens)
{
$this->validateInputInUnicodeMode($text);

$this->_text = $text;
$this->_tokens = $tokens;
$this->_nsStack = null;
Expand Down Expand Up @@ -272,9 +275,9 @@ protected function nextToken($offset)
*/
protected function matchLexeme($lexeme, $regex, $offset)
{
$_regex = str_replace('#', '\#', $regex);
$preg = preg_match(
'#\G(?|' . $_regex . ')#' . $this->_pcreOptions,
$_regex = '#\G(?|' . str_replace('#', '\#', $regex) . ')#' . $this->_pcreOptions;
$preg = @preg_match(
$_regex,
$this->_text,
$matches,
0,
Expand All @@ -285,6 +288,16 @@ protected function matchLexeme($lexeme, $regex, $offset)
return null;
}

if (false === $preg) {
throw new Compiler\Exception\InternalError(
sprintf(
'Lexer encountered a PCRE error (code: %d), full regex: "%s".',
preg_last_error(),
$_regex
)
);
}

if ('' === $matches[0]) {
throw new Compiler\Exception\Lexer(
'A lexeme must not match an empty value, which is the ' .
Expand All @@ -300,4 +313,17 @@ protected function matchLexeme($lexeme, $regex, $offset)
'length' => mb_strlen($matches[0])
];
}

/**
 * Rejects text that is not valid UTF-8 when the PCRE options in use contain
 * the "u" flag. Does nothing when that flag is absent.
 *
 * @param   string  $text    Text about to be lexed.
 * @return  void
 * @throws  \Hoa\Compiler\Exception\Lexer  When the "u" flag is set and $text
 *                                         is not a valid UTF-8 string.
 */
private function validateInputInUnicodeMode($text)
{
    $unicodeEnabled = false !== strpos($this->_pcreOptions, 'u');

    // Without the "u" flag the text is not checked at all.
    if (!$unicodeEnabled) {
        return;
    }

    if (mb_check_encoding($text, 'utf-8')) {
        return;
    }

    throw new Compiler\Exception\Lexer(
        'Text is not valid utf-8 string, you probably need to switch "lexer.unicode" setting off.'
    );
}
}
23 changes: 23 additions & 0 deletions Test/Unit/Llk/Lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -496,4 +496,27 @@ public function case_unicode_disabled()
''
);
}

/**
 * Checks that lexing a text that is not valid UTF-8, with the
 * "lexer.unicode" pragma set to true, fails with a Lexer exception whose
 * message reports the invalid encoding.
 */
public function case_invalid_utf8_with_unicode_mode()
{
$this
->given(
// SUT is presumably an alias of the lexer class under test (its import is not visible here).
$lexer = new SUT(['lexer.unicode' => true]),
// A lone 0xFF byte can never appear in well-formed UTF-8.
$datum = "\xFF",
// Single token in the "default" namespace whose pattern is the same invalid byte.
$tokens = [
'default' => [
'foo' => "\xFF"
]
]
)
->when($result = $lexer->lexMe($datum, $tokens))
->then
// The exception is expected when the result is advanced, not when lexMe() is called.
// NOTE(review): presumably lexMe() returns a lazy iterator — confirm.
->exception(function () use ($result) {
$result->next();
})
// LUT is an alias declared outside this view; presumably the library namespace — confirm.
->isInstanceOf(LUT\Exception\Lexer::class)
->hasMessage(
'Text is not valid utf-8 string, you probably need to switch "lexer.unicode" setting off.'
);
}
}

0 comments on commit 05740f1

Please sign in to comment.