Skip to content

Commit

Permalink
automata: fix more out-dated regex-cli commands
Browse files Browse the repository at this point in the history
That should cover all of them.

Closes #1053
  • Loading branch information
BurntSushi committed Oct 6, 2023
1 parent 4e873e2 commit 3f36a63
Show file tree
Hide file tree
Showing 10 changed files with 30 additions and 26 deletions.
13 changes: 7 additions & 6 deletions regex-automata/src/dfa/accel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@
// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
// DFA with regex-cli:
//
// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC
// dense::DFA(
// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
// D 000000:
// Q 000001:
// *000002:
// A 000003: \x00-` => 3, a => 5, b-\xFF => 3
// >000004: \x00-` => 3, a => 4, b-\xFF => 3
// 000005: \x00-\xFF => 2, EOI => 2
// )
// A 000003: \x00-` => 3, a => 8, b-\xFF => 3
// A 000004: \x00-` => 4, a => 7, b-\xFF => 4
// 000005: \x00-` => 4, b-\xFF => 4
// 000006: \x00-` => 3, a => 6, b-\xFF => 3
// 000007: \x00-\xFF => 2, EOI => 2
// 000008: \x00-\xFF => 2, EOI => 2
//
// In particular, state 3 is accelerated (shown via the 'A' indicator) since
// the only way to leave that state once entered is to see an 'a' byte. If
Expand Down
2 changes: 1 addition & 1 deletion regex-automata/src/dfa/automaton.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1132,7 +1132,7 @@ pub unsafe trait Automaton {
/// // implementation defined.
/// //
/// // N.B. We get '3' by inspecting the state machine using 'regex-cli'.
/// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`.
/// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`.
/// let id = StateID::new(3 * dfa.stride()).unwrap();
/// let accelerator = dfa.accelerator(id);
/// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated.
Expand Down
5 changes: 3 additions & 2 deletions regex-automata/src/dfa/dense.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1228,8 +1228,9 @@ impl Builder {
} else {
let mut set = nfa.byte_class_set().clone();
// It is important to distinguish any "quit" bytes from all other
// bytes. Otherwise, a non-quit byte may end up in the same class
// as a quit byte, and thus cause the DFA stop when it shouldn't.
// bytes. Otherwise, a non-quit byte may end up in the same
// class as a quit byte, and thus cause the DFA to stop when it
// shouldn't.
//
// Test case:
//
Expand Down
10 changes: 6 additions & 4 deletions regex-automata/src/hybrid/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2103,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> {
/// Here's an example that justifies 'inline(never)'
///
/// ```ignore
/// regex-cli find hybrid dfa \
/// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000
/// regex-cli find match hybrid \
/// --cache-capacity 100000000 \
/// -p '\pL{100}' \
/// all-codepoints-utf8-100x
/// ```
///
/// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
Expand Down Expand Up @@ -3830,8 +3832,8 @@ impl Config {
//
// Test case:
//
// regex-cli find hybrid regex -w @conn.json.1000x.log \
// '^#' '\b10\.55\.182\.100\b'
// regex-cli find match hybrid --unicode-word-boundary \
// -p '^#' -p '\b10\.55\.182\.100\b' conn.json.1000x.log
if !quit.is_empty() {
set.add_set(&quit);
}
Expand Down
10 changes: 5 additions & 5 deletions regex-automata/src/hybrid/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,14 @@ fn find_fwd_imp(
// PERF: For justification of omitting bounds checks, it gives us a
// ~10% bump in search time. This was used for a benchmark:
//
// regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb
// regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
//
// PERF: For justification for the loop unrolling, we use a few
// different tests:
//
// regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb
// regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb
// regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb
// regex-cli find half hybrid -p '\w{50}' -UBb bigfile
// regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
// regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile
//
// And there are three different configurations:
//
Expand Down Expand Up @@ -353,7 +353,7 @@ fn find_rev_imp(
// anchored and on shorter haystacks. However, this still makes a
// difference. Take this command for example:
//
// regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb
// regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile
//
// (Notice that we use 'find match hybrid', not 'find half hybrid'
// like in the justification for the forward direction. The 'match'
Expand Down
2 changes: 1 addition & 1 deletion regex-automata/src/nfa/thompson/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1466,7 +1466,7 @@ impl Compiler {
// compare and contrast performance of the Pike VM when the code below
// is active vs the code above. Here's an example to try:
//
// regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru'
// regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
//
// With Unicode classes generated below, this search takes about 45s on
// my machine. But with the compressed version above, the search takes
Expand Down
2 changes: 1 addition & 1 deletion regex-automata/src/nfa/thompson/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037;
/// Specifically, one could observe the difference with std's hashmap via
/// something like the following benchmark:
///
/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'"
///
/// But to observe that difference, you'd have to modify the code to use
/// std's hashmap.
Expand Down
6 changes: 2 additions & 4 deletions regex-automata/src/nfa/thompson/nfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1841,14 +1841,12 @@ impl SparseTransitions {
// This is an alternative implementation that uses binary search. In
// some ad hoc experiments, like
//
// smallishru=OpenSubtitles2018.raw.sample.smallish.ru
// regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
// regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file
//
// I could not observe any improvement, and in fact, things seemed to
// be a bit slower. I can see an improvement in at least one benchmark:
//
// allcpssmall=all-codepoints-utf8-10x
// regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}'
// regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8
//
// Where total search time goes from 3.2s to 2.4s when using binary
// search.
Expand Down
2 changes: 1 addition & 1 deletion regex-automata/src/nfa/thompson/range_trie.rs
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ impl State {
// Benchmarks suggest that binary search is just a bit faster than
// straight linear search. Specifically when using the debug tool:
//
// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'"
binary_search(&self.transitions, |t| range.start <= t.range.end)
}

Expand Down
4 changes: 3 additions & 1 deletion regex-automata/src/util/look.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1024,7 +1024,9 @@ impl core::fmt::Display for UnicodeWordBoundaryError {
// There are perhaps other choices as well. Why did I stop at these 4? Because
// I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA
// approach eventually, as the benefits of the DFA approach are somewhat
// compelling. The 'boundary-words-holmes' benchmark tests this:
// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that
// the commands below no longer work. If necessary, we should recreate
// the benchmark from whole cloth in rebar.)
//
// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
//
Expand Down

0 comments on commit 3f36a63

Please sign in to comment.