diff --git a/custom-parser/parser/DynamicRecursiveDescentParser.php b/custom-parser/parser/DynamicRecursiveDescentParser.php deleted file mode 100644 index 403afbb5..00000000 --- a/custom-parser/parser/DynamicRecursiveDescentParser.php +++ /dev/null @@ -1,448 +0,0 @@ -get_next_token(); - $tokens[] = $token; - } while ( MySQLLexer::EOF !== $token->type ); - return $tokens; -} -class Grammar { - - public $rules; - public $rule_names; - public $fragment_ids; - public $lookahead_is_match_possible = array(); - public $lowest_non_terminal_id; - public $highest_terminal_id; - - public function __construct( array $rules ) { - $this->inflate( $rules ); - } - - public function get_rule_name( $rule_id ) { - return $this->rule_names[ $rule_id ]; - } - - public function get_rule_id( $rule_name ) { - return array_search( $rule_name, $this->rule_names, true ); - } - - /** - * Grammar is a packed PHP array to minimize the file size. Every - * rule and token is encoded as an integer. It still takes 1.2MB, - * maybe we can do better than that with a more efficient encoding, - * e.g. what Dennis Snell did for the HTML entity decoder. - * Or maybe we can reduce the grammar size by factoring the rules? - * Or perhaps we can let go of some parsing rules that SQLite cannot - * support anyway? - */ - private function inflate( $grammar ) { - $this->lowest_non_terminal_id = $grammar['rules_offset']; - $this->highest_terminal_id = $this->lowest_non_terminal_id - 1; - - foreach ( $grammar['rules_names'] as $rule_index => $rule_name ) { - $this->rule_names[ $rule_index + $grammar['rules_offset'] ] = $rule_name; - $this->rules[ $rule_index + $grammar['rules_offset'] ] = array(); - /** - * Treat all intermediate rules as fragments to inline before returning - * the final parse tree to the API consumer. - * - * The original grammar was too difficult to parse with rules like - * - * query ::= EOF | ((simpleStatement | beginWork) ((SEMICOLON_SYMBOL EOF?) | EOF)) - * - * We've factored rules like bitExpr* to separate rules like bitExpr_zero_or_more. - * This is super useful for parsing, but it limits the API consumer's ability to - * reason about the parse tree. - * - * The following rules as fragments: - * - * * Rules starting with a percent sign ("%") – these are intermediate - * rules that are not part of the original grammar. They are useful - * - */ - if ( '%' === $rule_name[0] ) { - $this->fragment_ids[ $rule_index + $grammar['rules_offset'] ] = true; - } - } - - $this->rules = array(); - foreach ( $grammar['grammar'] as $rule_index => $branches ) { - $rule_id = $rule_index + $grammar['rules_offset']; - $this->rules[ $rule_id ] = $branches; - } - - /** - * Compute a rule => [token => true] lookup table for each rule - * that starts with a terminal OR with another rule that already - * has a lookahead mapping. - * - * This is similar to left-factoring the grammar, even if not quite - * the same. - * - * This enables us to quickly bale out from checking branches that - * cannot possibly match the current token. This increased the parser - * speed by a whooping 80%! - * - * The next step could be to: - * - * * Compute a rule => [token => branch[]] list lookup table and only - * process the branches that have a chance of matching the current token. - * * Actually left-factor the grammar as much as possible. This, however, - * could inflate the serialized grammar size. - */ - // 5 iterations seem to give us all the speed gains we can get from this. 
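		// Illustration (toy rule names, not taken from the real grammar): the loop
		// below can only record a lookahead entry for a rule once the first symbol
		// of every branch is either a terminal or a rule that already has an entry,
		// so each pass resolves one more level of nesting. With rules such as
		//
		//   literal    ::= NUMBER | TEXT              (resolved on pass 1)
		//   simpleExpr ::= literal ... | OPEN_PAR ... (resolved on pass 2)
		//   bitExpr    ::= simpleExpr ...             (resolved on pass 3)
		//
		// the resulting entry would look roughly like:
		//
		//   $this->lookahead_is_match_possible[ <bitExpr id> ] = array(
		//       <NUMBER id>   => true,
		//       <TEXT id>     => true,
		//       <OPEN_PAR id> => true,
		//   );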
- for ( $i = 0; $i < 5; $i++ ) { - foreach ( $grammar['grammar'] as $rule_index => $branches ) { - $rule_id = $rule_index + $grammar['rules_offset']; - if ( isset( $this->lookahead_is_match_possible[ $rule_id ] ) ) { - continue; - } - $rule_lookup = array(); - $first_symbol_can_be_expanded_to_all_terminals = true; - foreach ( $branches as $branch ) { - $terminals = false; - $branch_starts_with_terminal = $branch[0] < $this->lowest_non_terminal_id; - if ( $branch_starts_with_terminal ) { - $terminals = array( $branch[0] ); - } elseif ( isset( $this->lookahead_is_match_possible[ $branch[0] ] ) ) { - $terminals = array_keys( $this->lookahead_is_match_possible[ $branch[0] ] ); - } - - if ( false === $terminals ) { - $first_symbol_can_be_expanded_to_all_terminals = false; - break; - } - foreach ( $terminals as $terminal ) { - $rule_lookup[ $terminal ] = true; - } - } - if ( $first_symbol_can_be_expanded_to_all_terminals ) { - $this->lookahead_is_match_possible[ $rule_id ] = $rule_lookup; - } - } - } - } -} - -class StackFrame { - public $rule_id; - public $starting_position = 0; - public $position = 0; - public $branch_index = 0; - public $subrule_index = 0; - public $match = array(); - public $child_frame; -} - -class ParseTree { - public $rule_id; - public $rule_name; - public $children = array(); - - public function __construct( $rule_id, $rule_name ) { - $this->rule_id = $rule_id; - $this->rule_name = $rule_name; - } - - public function append_child( $node ) { - $this->children[] = $node; - } - - /** - * Flatten the matched rule fragments as if their children were direct - * descendants of the current rule. - * - * What are rule fragments? - * - * When we initially parse the BNF grammar file, it has compound rules such - * as this one: - * - * query ::= EOF | ((simpleStatement | beginWork) ((SEMICOLON_SYMBOL EOF?) | EOF)) - * - * Building a parser that can understand such rules is way more complex than building - * a parser that only follows simple rules, so we flatten those compound rules into - * simpler ones. The above rule would be flattened to: - * - * query ::= EOF | %query0 - * %query0 ::= %%query01 %%query02 - * %%query01 ::= simpleStatement | beginWork - * %%query02 ::= SEMICOLON_SYMBOL EOF_zero_or_one | EOF - * EOF_zero_or_one ::= EOF | ε - * - * This factorization happens in 1-ebnf-to-json.js. - * - * "Fragments" are intermediate artifacts whose names are not in the original grammar. - * They are extremely useful for the parser, but the API consumer should never have to - * worry about them. Fragment names start with a percent sign ("%"). - * - * The code below inlines every fragment back in its parent rule. - * - * We could optimize this. The current $match may be discarded later on so any inlining - * effort here would be wasted. However, inlining seems cheap and doing it bottom-up here - * is **much** easier than reprocessing the parse tree top-down later on. 
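	 *
	 * (A note on the repetition helpers seen above, added as an illustration and
	 * not taken verbatim from the generated grammar: a repetition such as bitExpr*
	 * is presumably factored into a right-recursive helper rule along the lines of
	 *
	 *   bitExpr_zero_or_more ::= bitExpr bitExpr_zero_or_more | ε
	 *
	 * The parser strips suffixes such as _zero_or_one, _zero_or_more and
	 * _one_or_more when naming parse tree nodes (see the str_replace() call in
	 * parse_recursive()), so API consumers still see the original rule names.)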
- * - * The following parse tree: - * - * [ - * 'query' => [ - * [ - * '%query01' => [ - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ], - * '%query02' => [ - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ] - * ], - * ] - * ] - * ] - * ] - * ] - * - * Would be inlined as: - * - * [ - * 'query' => [ - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ] - * ], - * [ - * 'simpleStatement' => [ - * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') - * ] - * ] - * ] - * ] - */ - public function merge_fragment( $node ) { - $this->children = array_merge( $this->children, $node->children ); - } - - public function has_child( $rule_name ) { - foreach ( $this->children as $child ) { - if ( ( $child instanceof ParseTree && $child->rule_name === $rule_name ) ) { - return true; - } - } - return false; - } - - public function has_token( $token_id = null ) { - foreach ( $this->children as $child ) { - if ( $child instanceof MySQLToken && ( - null === $token_id || - $child->type === $token_id - ) ) { - return true; - } - } - return false; - } - - public function get_token( $token_id = null ) { - foreach ( $this->children as $child ) { - if ( $child instanceof MySQLToken && ( - null === $token_id || - $child->type === $token_id - ) ) { - return $child; - } - } - return null; - } - - public function get_child( $rule_name = null ) { - foreach ( $this->children as $child ) { - if ( $child instanceof ParseTree && ( - $child->rule_name === $rule_name || - null === $rule_name - ) ) { - return $child; - } - } - } - - public function get_descendant( $rule_name ) { - $parse_trees = array( $this ); - while ( count( $parse_trees ) ) { - $parse_tree = array_pop( $parse_trees ); - if ( $parse_tree->rule_name === $rule_name ) { - return $parse_tree; - } - array_push( $parse_trees, ...$parse_tree->get_children() ); - } - return null; - } - - public function get_descendants( $rule_name ) { - $parse_trees = array( $this ); - $all_descendants = array(); - while ( count( $parse_trees ) ) { - $parse_tree = array_pop( $parse_trees ); - $all_descendants = array_merge( $all_descendants, $parse_tree->get_children( $rule_name ) ); - array_push( $parse_trees, ...$parse_tree->get_children() ); - } - return $all_descendants; - } - - public function get_children( $rule_name = null ) { - $matches = array(); - foreach ( $this->children as $child ) { - if ( $child instanceof ParseTree && ( - null === $rule_name || - $child->rule_name === $rule_name - ) ) { - $matches[] = $child; - } - } - return $matches; - } -} - -class DynamicRecursiveDescentParser { - private $tokens; - private $position; - private $grammar; - - public function __construct( Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->tokens = $tokens; - $this->position = 0; - } - - public function parse() { - $query_rule_id = $this->grammar->get_rule_id( 'query' ); - return $this->parse_recursive( $query_rule_id ); - } - - - private function parse_recursive( $rule_id ) { - //var_dump($this->get_rule_name($rule_id)); - $is_terminal = $rule_id <= $this->grammar->highest_terminal_id; - if ( $is_terminal ) { - // Inlining a $this->match($rule_id) call here speeds the - // parser up by a whooping 10%! 
- if ( $this->position >= count( $this->tokens ) ) { - return null; - } - - if ( MySQLLexer::EMPTY_TOKEN === $rule_id ) { - return true; - } - - if ( $this->tokens[ $this->position ]->type === $rule_id ) { - ++$this->position; - return $this->tokens[ $this->position - 1 ]; - } - return null; - } - - $rule = $this->grammar->rules[ $rule_id ]; - if ( ! count( $rule ) ) { - return null; - } - - // Bale out from processing the current branch if none of its rules can - // possibly match the current token. - if ( isset( $this->grammar->lookahead_is_match_possible[ $rule_id ] ) ) { - $token_id = $this->tokens[ $this->position ]->type; - if ( - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ $token_id ] ) && - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ MySQLLexer::EMPTY_TOKEN ] ) - ) { - return null; - } - } - - $rule_name = str_replace( - array( '_zero_or_one', '_zero_or_more', '_one_or_more', '_rr', '_nested' ), - '', - $this->grammar->rule_names[ $rule_id ] - ); - - //var_dump($this->get_rule_name($rule_id)); - - $starting_position = $this->position; - foreach ( $rule as $branch ) { - $this->position = $starting_position; - $node = new ParseTree( $rule_id, $rule_name ); - $branch_matches = true; - foreach ( $branch as $subrule_id ) { - $subnode = $this->parse_recursive( $subrule_id ); - if ( null === $subnode ) { - $branch_matches = false; - break; - } elseif ( true === $subnode ) { - // ε – the rule matched without actually matching a token. - // Proceed without adding anything to $match. - continue; - } elseif ( is_array( $subnode ) && 0 === count( $subnode ) ) { - continue; - } - if ( is_array( $subnode ) && ! count( $subnode ) ) { - continue; - } - if ( isset( $this->grammar->fragment_ids[ $subrule_id ] ) ) { - $node->merge_fragment( $subnode ); - } else { - $node->append_child( $subnode ); - } - } - - // Negative lookahead for INTO after a valid SELECT statement. - // If we match a SELECT statement, but there is an INTO keyword after it, - // we're in the wrong branch and need to leave matching to a later rule. - // For now, it's hard-coded, but we could extract it to a lookahead table. - $la = $this->tokens[ $this->position ] ?? null; - if ( $la && 'selectStatement' === $rule_name && MySQLLexer::INTO_SYMBOL === $la->type ) { - $branch_matches = false; - } - - if ( true === $branch_matches ) { - break; - } - } - - if ( ! $branch_matches ) { - $this->position = $starting_position; - return null; - } - - if ( 0 === count( $node->children ) ) { - return true; - } - - return $node; - } - - private function get_rule_name( $id ) { - if ( $id <= $this->grammar->highest_terminal_id ) { - return MySQLLexer::get_token_name( $id ); - } - - return $this->grammar->get_rule_name( $id ); - } -} diff --git a/custom-parser/grammar-factoring/MySQLParser.g4 b/grammar-tools/MySQLParser.g4 similarity index 100% rename from custom-parser/grammar-factoring/MySQLParser.g4 rename to grammar-tools/MySQLParser.g4 diff --git a/custom-parser/grammar-factoring/convert-grammar.php b/grammar-tools/convert-grammar.php similarity index 97% rename from custom-parser/grammar-factoring/convert-grammar.php rename to grammar-tools/convert-grammar.php index 5ece8228..caf0173a 100644 --- a/custom-parser/grammar-factoring/convert-grammar.php +++ b/grammar-tools/convert-grammar.php @@ -1,8 +1,8 @@ $name ) { $is_terminal = ! 
isset( $rule_id_by_name[ $name ] ); if ( $is_terminal ) { - $new_branch[] = MySQLLexer::get_token_id( $name ); + $new_branch[] = MySQL_Lexer::get_token_id( $name ); } else { // Use rule id to avoid conflicts with token ids $new_branch[] = $rule_id_by_name[ $name ]; diff --git a/tests/parser/run-lexer-tests.php b/tests/parser/run-lexer-tests.php index d516b690..2005de85 100644 --- a/tests/parser/run-lexer-tests.php +++ b/tests/parser/run-lexer-tests.php @@ -7,8 +7,7 @@ function ( $severity, $message, $file, $line ) { } ); -require_once __DIR__ . '/../../custom-parser/parser/DynamicRecursiveDescentParser.php'; -require_once __DIR__ . '/../../custom-parser/parser/MySQLLexer.php'; +require_once __DIR__ . '/../../wp-includes/mysql/class-mysql-lexer.php'; $handle = fopen( __DIR__ . '/data/queries.csv', 'r' ); diff --git a/tests/parser/run-parser-tests.php b/tests/parser/run-parser-tests.php index a0033398..7253ff01 100644 --- a/tests/parser/run-parser-tests.php +++ b/tests/parser/run-parser-tests.php @@ -7,6 +7,9 @@ function ( $severity, $message, $file, $line ) { } ); +require_once __DIR__ . '/../../wp-includes/mysql/class-mysql-lexer.php'; +require_once __DIR__ . '/../../wp-includes/parser/class-parser.php'; + function getStats( $total, $failures, $exceptions ) { return sprintf( 'Total: %5d | Failures: %4d / %2d%% | Exceptions: %4d / %2d%%', @@ -18,10 +21,7 @@ function getStats( $total, $failures, $exceptions ) { ); } -require_once __DIR__ . '/../../custom-parser/parser/DynamicRecursiveDescentParser.php'; -require_once __DIR__ . '/../../custom-parser/parser/MySQLLexer.php'; - -$grammar_data = include __DIR__ . '/../../custom-parser/parser/grammar.php'; +$grammar_data = include __DIR__ . '/../../wp-includes/mysql/mysql-grammar.php'; $grammar = new Grammar( $grammar_data ); $handle = fopen( __DIR__ . 
'/data/queries.csv', 'r' ); @@ -50,7 +50,7 @@ function getStats( $total, $failures, $exceptions ) { throw new Exception( 'Empty tokens' ); } - $parser = new DynamicRecursiveDescentParser( $grammar, $tokens ); + $parser = new Parser( $grammar, $tokens ); $parse_tree = $parser->parse(); if ( null === $parse_tree ) { $failures[] = $query; diff --git a/tests/parser/run-single.php b/tests/parser/run-single.php new file mode 100644 index 00000000..0e2724b6 --- /dev/null +++ b/tests/parser/run-single.php @@ -0,0 +1,48 @@ +parse(); +var_dump( $parse_tree ); diff --git a/custom-parser/parser/SQLiteDriver.php b/wip/SQLiteDriver.php similarity index 99% rename from custom-parser/parser/SQLiteDriver.php rename to wip/SQLiteDriver.php index 1736975e..b5cad10a 100644 --- a/custom-parser/parser/SQLiteDriver.php +++ b/wip/SQLiteDriver.php @@ -1,4 +1,4 @@ -has_found_rows_call = false; $this->last_calc_rows_result = null; - $parser = new DynamicRecursiveDescentParser( $this->grammar, tokenize_query( $query ) ); + $parser = new Parser( $this->grammar, tokenize_query( $query ) ); $parse_tree = $parser->parse(); $expr = $this->translate_query( $parse_tree ); $expr = $this->rewrite_sql_calc_found_rows( $expr ); @@ -126,13 +126,13 @@ private function translate_query( $parse_tree ) { return null; } - if ( $parse_tree instanceof MySQLToken ) { + if ( $parse_tree instanceof MySQL_Token ) { $token = $parse_tree; switch ( $token->type ) { - case MySQLLexer::EOF: + case MySQL_Lexer::EOF: return new SQLiteExpression( array() ); - case MySQLLexer::IDENTIFIER: + case MySQL_Lexer::IDENTIFIER: return new SQLiteExpression( array( SQLiteTokenFactory::identifier( @@ -150,7 +150,7 @@ private function translate_query( $parse_tree ) { } } - if ( ! ( $parse_tree instanceof ParseTree ) ) { + if ( ! ( $parse_tree instanceof Parse_Tree ) ) { throw new Exception( 'translateQuery only accepts MySQLToken and ParseTree instances' ); } @@ -165,14 +165,14 @@ private function translate_query( $parse_tree ) { case 'querySpecOption': $token = $parse_tree->get_token(); switch ( $token->type ) { - case MySQLLexer::ALL_SYMBOL: - case MySQLLexer::DISTINCT_SYMBOL: + case MySQL_Lexer::ALL_SYMBOL: + case MySQL_Lexer::DISTINCT_SYMBOL: return new SQLiteExpression( array( SQLiteTokenFactory::raw( $token->text ), ) ); - case MySQLLexer::SQL_CALC_FOUND_ROWS_SYMBOL: + case MySQL_Lexer::SQL_CALC_FOUND_ROWS_SYMBOL: $this->has_sql_calc_found_rows = true; // Fall through to default. default: @@ -188,7 +188,7 @@ private function translate_query( $parse_tree ) { // FROM DUAL statement, as FROM mytable, DUAL is a syntax // error. if ( - $parse_tree->has_token( MySQLLexer::DUAL_SYMBOL ) && + $parse_tree->has_token( MySQL_Lexer::DUAL_SYMBOL ) && ! $parse_tree->has_child( 'tableReferenceList' ) ) { return null; @@ -272,10 +272,10 @@ private function translate_query( $parse_tree ) { case 'textStringLiteral': return new SQLiteExpression( array( - $parse_tree->has_token( MySQLLexer::DOUBLE_QUOTED_TEXT ) ? - SQLiteTokenFactory::double_quoted_value( $parse_tree->get_token( MySQLLexer::DOUBLE_QUOTED_TEXT )->text ) : false, - $parse_tree->has_token( MySQLLexer::SINGLE_QUOTED_TEXT ) ? - SQLiteTokenFactory::raw( $parse_tree->get_token( MySQLLexer::SINGLE_QUOTED_TEXT )->text ) : false, + $parse_tree->has_token( MySQL_Lexer::DOUBLE_QUOTED_TEXT ) ? + SQLiteTokenFactory::double_quoted_value( $parse_tree->get_token( MySQL_Lexer::DOUBLE_QUOTED_TEXT )->text ) : false, + $parse_tree->has_token( MySQL_Lexer::SINGLE_QUOTED_TEXT ) ? 
+ SQLiteTokenFactory::raw( $parse_tree->get_token( MySQL_Lexer::SINGLE_QUOTED_TEXT )->text ) : false, ) ); diff --git a/custom-parser/parser/MySQLLexer.php b/wp-includes/mysql/class-mysql-lexer.php similarity index 99% rename from custom-parser/parser/MySQLLexer.php rename to wp-includes/mysql/class-mysql-lexer.php index e487169d..a0c43f00 100644 --- a/custom-parser/parser/MySQLLexer.php +++ b/wp-includes/mysql/class-mysql-lexer.php @@ -1,5 +1,7 @@ match_eof(); - $this->token_instance = new MySQLToken( self::EOF, '' ); + $this->token_instance = new MySQL_Token( self::EOF, '' ); return false; } else { $this->consume(); $this->type = self::INVALID_INPUT; } - $this->token_instance = null === $this->type ? null : new MySQLToken( $this->type, $this->text, $this->channel ); + $this->token_instance = null === $this->type ? null : new MySQL_Token( $this->type, $this->text, $this->channel ); return true; } @@ -2476,7 +2478,7 @@ protected function identifier_or_keyword() { } // With "SQL_MODE_HIGH_NOT_PRECEDENCE" enabled, "NOT" needs to be emitted as a higher priority NOT2 symbol. - if ( self::NOT_SYMBOL === $this->type && $this->is_sql_mode_active( MySQLLexer::SQL_MODE_HIGH_NOT_PRECEDENCE ) ) { + if ( self::NOT_SYMBOL === $this->type && $this->is_sql_mode_active( MySQL_Lexer::SQL_MODE_HIGH_NOT_PRECEDENCE ) ) { $this->type = self::NOT2_SYMBOL; } @@ -2739,46 +2741,12 @@ private function determine_numeric_type( $text ) { } } -class MySQLToken { - public $type; - public $text; - public $channel; - - public function __construct( $type, $text, $channel = null ) { - $this->type = $type; - $this->text = $text; - $this->channel = $channel; - } - - public function get_type() { - return $this->type; - } - - public function get_name() { - return MySQLLexer::get_token_name( $this->type ); - } - - public function get_text() { - return $this->text; - } - - public function get_channel() { - return $this->channel; - } - - public function __toString() { - return $this->text . '<' . $this->type . ',' . $this->get_name() . '>'; - } - - public function extract_value() { - if ( MySQLLexer::BACK_TICK_QUOTED_ID === $this->type ) { - return substr( $this->text, 1, -1 ); - } elseif ( MySQLLexer::DOUBLE_QUOTED_TEXT === $this->type ) { - return substr( $this->text, 1, -1 ); - } elseif ( MySQLLexer::SINGLE_QUOTED_TEXT === $this->type ) { - return substr( $this->text, 1, -1 ); - } else { - return $this->text; - } - } +function tokenize_query( $sql ) { + $lexer = new MySQL_Lexer( $sql ); + $tokens = array(); + do { + $token = $lexer->get_next_token(); + $tokens[] = $token; + } while ( MySQL_Lexer::EOF !== $token->type ); + return $tokens; } diff --git a/wp-includes/mysql/class-mysql-token.php b/wp-includes/mysql/class-mysql-token.php new file mode 100644 index 00000000..c2f01330 --- /dev/null +++ b/wp-includes/mysql/class-mysql-token.php @@ -0,0 +1,45 @@ +type = $type; + $this->text = $text; + $this->channel = $channel; + } + + public function get_type() { + return $this->type; + } + + public function get_name() { + return MySQL_Lexer::get_token_name( $this->type ); + } + + public function get_text() { + return $this->text; + } + + public function get_channel() { + return $this->channel; + } + + public function __toString() { + return $this->text . '<' . $this->type . ',' . $this->get_name() . 
'>'; + } + + public function extract_value() { + if ( MySQL_Lexer::BACK_TICK_QUOTED_ID === $this->type ) { + return substr( $this->text, 1, -1 ); + } elseif ( MySQL_Lexer::DOUBLE_QUOTED_TEXT === $this->type ) { + return substr( $this->text, 1, -1 ); + } elseif ( MySQL_Lexer::SINGLE_QUOTED_TEXT === $this->type ) { + return substr( $this->text, 1, -1 ); + } else { + return $this->text; + } + } +} diff --git a/custom-parser/parser/grammar.php b/wp-includes/mysql/mysql-grammar.php similarity index 100% rename from custom-parser/parser/grammar.php rename to wp-includes/mysql/mysql-grammar.php diff --git a/wp-includes/parser/class-grammar.php b/wp-includes/parser/class-grammar.php new file mode 100644 index 00000000..d8da55e6 --- /dev/null +++ b/wp-includes/parser/class-grammar.php @@ -0,0 +1,119 @@ +inflate( $rules ); + } + + public function get_rule_name( $rule_id ) { + return $this->rule_names[ $rule_id ]; + } + + public function get_rule_id( $rule_name ) { + return array_search( $rule_name, $this->rule_names, true ); + } + + /** + * Grammar is a packed PHP array to minimize the file size. Every + * rule and token is encoded as an integer. It still takes 1.2MB, + * maybe we can do better than that with a more efficient encoding, + * e.g. what Dennis Snell did for the HTML entity decoder. + * Or maybe we can reduce the grammar size by factoring the rules? + * Or perhaps we can let go of some parsing rules that SQLite cannot + * support anyway? + */ + private function inflate( $grammar ) { + $this->lowest_non_terminal_id = $grammar['rules_offset']; + $this->highest_terminal_id = $this->lowest_non_terminal_id - 1; + + foreach ( $grammar['rules_names'] as $rule_index => $rule_name ) { + $this->rule_names[ $rule_index + $grammar['rules_offset'] ] = $rule_name; + $this->rules[ $rule_index + $grammar['rules_offset'] ] = array(); + /** + * Treat all intermediate rules as fragments to inline before returning + * the final parse tree to the API consumer. + * + * The original grammar was too difficult to parse with rules like + * + * query ::= EOF | ((simpleStatement | beginWork) ((SEMICOLON_SYMBOL EOF?) | EOF)) + * + * We've factored rules like bitExpr* to separate rules like bitExpr_zero_or_more. + * This is super useful for parsing, but it limits the API consumer's ability to + * reason about the parse tree. + * + * The following rules as fragments: + * + * * Rules starting with a percent sign ("%") – these are intermediate + * rules that are not part of the original grammar. They are useful + * + */ + if ( '%' === $rule_name[0] ) { + $this->fragment_ids[ $rule_index + $grammar['rules_offset'] ] = true; + } + } + + $this->rules = array(); + foreach ( $grammar['grammar'] as $rule_index => $branches ) { + $rule_id = $rule_index + $grammar['rules_offset']; + $this->rules[ $rule_id ] = $branches; + } + + /** + * Compute a rule => [token => true] lookup table for each rule + * that starts with a terminal OR with another rule that already + * has a lookahead mapping. + * + * This is similar to left-factoring the grammar, even if not quite + * the same. + * + * This enables us to quickly bale out from checking branches that + * cannot possibly match the current token. This increased the parser + * speed by a whooping 80%! + * + * The next step could be to: + * + * * Compute a rule => [token => branch[]] list lookup table and only + * process the branches that have a chance of matching the current token. + * * Actually left-factor the grammar as much as possible. 
This, however, + * could inflate the serialized grammar size. + */ + // 5 iterations seem to give us all the speed gains we can get from this. + for ( $i = 0; $i < 5; $i++ ) { + foreach ( $grammar['grammar'] as $rule_index => $branches ) { + $rule_id = $rule_index + $grammar['rules_offset']; + if ( isset( $this->lookahead_is_match_possible[ $rule_id ] ) ) { + continue; + } + $rule_lookup = array(); + $first_symbol_can_be_expanded_to_all_terminals = true; + foreach ( $branches as $branch ) { + $terminals = false; + $branch_starts_with_terminal = $branch[0] < $this->lowest_non_terminal_id; + if ( $branch_starts_with_terminal ) { + $terminals = array( $branch[0] ); + } elseif ( isset( $this->lookahead_is_match_possible[ $branch[0] ] ) ) { + $terminals = array_keys( $this->lookahead_is_match_possible[ $branch[0] ] ); + } + + if ( false === $terminals ) { + $first_symbol_can_be_expanded_to_all_terminals = false; + break; + } + foreach ( $terminals as $terminal ) { + $rule_lookup[ $terminal ] = true; + } + } + if ( $first_symbol_can_be_expanded_to_all_terminals ) { + $this->lookahead_is_match_possible[ $rule_id ] = $rule_lookup; + } + } + } + } +} diff --git a/wp-includes/parser/class-parse-tree.php b/wp-includes/parser/class-parse-tree.php new file mode 100644 index 00000000..5e338212 --- /dev/null +++ b/wp-includes/parser/class-parse-tree.php @@ -0,0 +1,172 @@ +rule_id = $rule_id; + $this->rule_name = $rule_name; + } + + public function append_child( $node ) { + $this->children[] = $node; + } + + /** + * Flatten the matched rule fragments as if their children were direct + * descendants of the current rule. + * + * What are rule fragments? + * + * When we initially parse the BNF grammar file, it has compound rules such + * as this one: + * + * query ::= EOF | ((simpleStatement | beginWork) ((SEMICOLON_SYMBOL EOF?) | EOF)) + * + * Building a parser that can understand such rules is way more complex than building + * a parser that only follows simple rules, so we flatten those compound rules into + * simpler ones. The above rule would be flattened to: + * + * query ::= EOF | %query0 + * %query0 ::= %%query01 %%query02 + * %%query01 ::= simpleStatement | beginWork + * %%query02 ::= SEMICOLON_SYMBOL EOF_zero_or_one | EOF + * EOF_zero_or_one ::= EOF | ε + * + * This factorization happens in 1-ebnf-to-json.js. + * + * "Fragments" are intermediate artifacts whose names are not in the original grammar. + * They are extremely useful for the parser, but the API consumer should never have to + * worry about them. Fragment names start with a percent sign ("%"). + * + * The code below inlines every fragment back in its parent rule. + * + * We could optimize this. The current $match may be discarded later on so any inlining + * effort here would be wasted. However, inlining seems cheap and doing it bottom-up here + * is **much** easier than reprocessing the parse tree top-down later on. 
+ * + * The following parse tree: + * + * [ + * 'query' => [ + * [ + * '%query01' => [ + * [ + * 'simpleStatement' => [ + * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') + * ], + * '%query02' => [ + * [ + * 'simpleStatement' => [ + * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') + * ] + * ], + * ] + * ] + * ] + * ] + * ] + * + * Would be inlined as: + * + * [ + * 'query' => [ + * [ + * 'simpleStatement' => [ + * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') + * ] + * ], + * [ + * 'simpleStatement' => [ + * MySQLToken(MySQLLexer::WITH_SYMBOL, 'WITH') + * ] + * ] + * ] + * ] + */ + public function merge_fragment( $node ) { + $this->children = array_merge( $this->children, $node->children ); + } + + public function has_child( $rule_name ) { + foreach ( $this->children as $child ) { + if ( ( $child instanceof Parse_Tree && $child->rule_name === $rule_name ) ) { + return true; + } + } + return false; + } + + public function has_token( $token_id = null ) { + foreach ( $this->children as $child ) { + if ( $child instanceof MySQL_Token && ( + null === $token_id || + $child->type === $token_id + ) ) { + return true; + } + } + return false; + } + + public function get_token( $token_id = null ) { + foreach ( $this->children as $child ) { + if ( $child instanceof MySQL_Token && ( + null === $token_id || + $child->type === $token_id + ) ) { + return $child; + } + } + return null; + } + + public function get_child( $rule_name = null ) { + foreach ( $this->children as $child ) { + if ( $child instanceof Parse_Tree && ( + $child->rule_name === $rule_name || + null === $rule_name + ) ) { + return $child; + } + } + } + + public function get_descendant( $rule_name ) { + $parse_trees = array( $this ); + while ( count( $parse_trees ) ) { + $parse_tree = array_pop( $parse_trees ); + if ( $parse_tree->rule_name === $rule_name ) { + return $parse_tree; + } + array_push( $parse_trees, ...$parse_tree->get_children() ); + } + return null; + } + + public function get_descendants( $rule_name ) { + $parse_trees = array( $this ); + $all_descendants = array(); + while ( count( $parse_trees ) ) { + $parse_tree = array_pop( $parse_trees ); + $all_descendants = array_merge( $all_descendants, $parse_tree->get_children( $rule_name ) ); + array_push( $parse_trees, ...$parse_tree->get_children() ); + } + return $all_descendants; + } + + public function get_children( $rule_name = null ) { + $matches = array(); + foreach ( $this->children as $child ) { + if ( $child instanceof Parse_Tree && ( + null === $rule_name || + $child->rule_name === $rule_name + ) ) { + $matches[] = $child; + } + } + return $matches; + } +} diff --git a/wp-includes/parser/class-parser.php b/wp-includes/parser/class-parser.php new file mode 100644 index 00000000..7fceb654 --- /dev/null +++ b/wp-includes/parser/class-parser.php @@ -0,0 +1,142 @@ +grammar = $grammar; + $this->tokens = $tokens; + $this->position = 0; + } + + public function parse() { + $query_rule_id = $this->grammar->get_rule_id( 'query' ); + return $this->parse_recursive( $query_rule_id ); + } + + + private function parse_recursive( $rule_id ) { + //var_dump($this->get_rule_name($rule_id)); + $is_terminal = $rule_id <= $this->grammar->highest_terminal_id; + if ( $is_terminal ) { + // Inlining a $this->match($rule_id) call here speeds the + // parser up by a whooping 10%! 
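		// For reference, the inlined terminal handling below is roughly equivalent
		// to a helper like this (sketch only, no such method exists in this class):
		//
		//   private function match( $token_id ) {
		//       if ( $this->position >= count( $this->tokens ) ) {
		//           return null;
		//       }
		//       if ( MySQL_Lexer::EMPTY_TOKEN === $token_id ) {
		//           return true; // ε – matched without consuming a token.
		//       }
		//       if ( $this->tokens[ $this->position ]->type === $token_id ) {
		//           return $this->tokens[ $this->position++ ];
		//       }
		//       return null;
		//   }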
+ if ( $this->position >= count( $this->tokens ) ) { + return null; + } + + if ( MySQL_Lexer::EMPTY_TOKEN === $rule_id ) { + return true; + } + + if ( $this->tokens[ $this->position ]->type === $rule_id ) { + ++$this->position; + return $this->tokens[ $this->position - 1 ]; + } + return null; + } + + $rule = $this->grammar->rules[ $rule_id ]; + if ( ! count( $rule ) ) { + return null; + } + + // Bale out from processing the current branch if none of its rules can + // possibly match the current token. + if ( isset( $this->grammar->lookahead_is_match_possible[ $rule_id ] ) ) { + $token_id = $this->tokens[ $this->position ]->type; + if ( + ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ $token_id ] ) && + ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ MySQL_Lexer::EMPTY_TOKEN ] ) + ) { + return null; + } + } + + $rule_name = str_replace( + array( '_zero_or_one', '_zero_or_more', '_one_or_more', '_rr', '_nested' ), + '', + $this->grammar->rule_names[ $rule_id ] + ); + + //var_dump($this->get_rule_name($rule_id)); + + $starting_position = $this->position; + foreach ( $rule as $branch ) { + $this->position = $starting_position; + $node = new Parse_Tree( $rule_id, $rule_name ); + $branch_matches = true; + foreach ( $branch as $subrule_id ) { + $subnode = $this->parse_recursive( $subrule_id ); + if ( null === $subnode ) { + $branch_matches = false; + break; + } elseif ( true === $subnode ) { + // ε – the rule matched without actually matching a token. + // Proceed without adding anything to $match. + continue; + } elseif ( is_array( $subnode ) && 0 === count( $subnode ) ) { + continue; + } + if ( is_array( $subnode ) && ! count( $subnode ) ) { + continue; + } + if ( isset( $this->grammar->fragment_ids[ $subrule_id ] ) ) { + $node->merge_fragment( $subnode ); + } else { + $node->append_child( $subnode ); + } + } + + // Negative lookahead for INTO after a valid SELECT statement. + // If we match a SELECT statement, but there is an INTO keyword after it, + // we're in the wrong branch and need to leave matching to a later rule. + // For now, it's hard-coded, but we could extract it to a lookahead table. + $la = $this->tokens[ $this->position ] ?? null; + if ( $la && 'selectStatement' === $rule_name && MySQL_Lexer::INTO_SYMBOL === $la->type ) { + $branch_matches = false; + } + + if ( true === $branch_matches ) { + break; + } + } + + if ( ! $branch_matches ) { + $this->position = $starting_position; + return null; + } + + if ( 0 === count( $node->children ) ) { + return true; + } + + return $node; + } + + private function get_rule_name( $id ) { + if ( $id <= $this->grammar->highest_terminal_id ) { + return MySQL_Lexer::get_token_name( $id ); + } + + return $this->grammar->get_rule_name( $id ); + } +}
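Usage sketch for the relocated classes (illustrative, not part of this patch; the query string is arbitrary and the explicit require list is an assumption, since some of these may already be loaded by the new files themselves):

<?php
require_once __DIR__ . '/wp-includes/mysql/class-mysql-lexer.php';
require_once __DIR__ . '/wp-includes/mysql/class-mysql-token.php';
require_once __DIR__ . '/wp-includes/parser/class-grammar.php';
require_once __DIR__ . '/wp-includes/parser/class-parse-tree.php';
require_once __DIR__ . '/wp-includes/parser/class-parser.php';

// Inflate the packed grammar once and reuse it for every query.
$grammar = new Grammar( include __DIR__ . '/wp-includes/mysql/mysql-grammar.php' );

// Tokenize and parse a single query.
$tokens = tokenize_query( 'SELECT 1' );
$parser = new Parser( $grammar, $tokens );
$tree   = $parser->parse();

// parse() returns a Parse_Tree on success, null on failure, or true for an
// empty (ε) match, so check the type before navigating the tree.
if ( $tree instanceof Parse_Tree ) {
	var_dump( $tree->get_descendant( 'simpleStatement' ) );
}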