rust-lang · BurntSushi · Sep 29, 2024 · Sep 29, 2024 · Sep 29, 2024
diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs
@@ -230,15 +230,15 @@ impl Config {
     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
     /// use regex_automata::nfa::thompson::NFA;
     ///
-    /// // 300KB isn't enough!
+    /// // 400KB isn't enough!
     /// NFA::compiler()
-    ///     .configure(NFA::config().nfa_size_limit(Some(300_000)))
+    ///     .configure(NFA::config().nfa_size_limit(Some(400_000)))
     ///     .build(r"\w{20}")
     ///     .unwrap_err();
     ///
-    /// // ... but 400KB probably is.
+    /// // ... but 500KB probably is.
     /// let nfa = NFA::compiler()
-    ///     .configure(NFA::config().nfa_size_limit(Some(400_000)))
+    ///     .configure(NFA::config().nfa_size_limit(Some(500_000)))
     ///     .build(r"\w{20}")?;
     ///
     /// assert_eq!(nfa.pattern_len(), 1);

diff --git a/regex-automata/src/util/unicode_data/perl_word.rs b/regex-automata/src/util/unicode_data/perl_word.rs
@@ -1,10 +1,10 @@
 // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
 //
-//   ucd-generate perl-word tmp/ucd-15.0.0/ --chars
+//   ucd-generate perl-word ucd-16.0.0 --chars
 //
-// Unicode version: 15.0.0.
+// Unicode version: 16.0.0.
 //
-// ucd-generate 0.2.15 is available on crates.io.
+// ucd-generate 0.3.1 is available on crates.io.
 
 pub const PERL_WORD: &'static [(char, char)] = &[
     ('0', '9'),
@@ -59,7 +59,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('ࡠ', 'ࡪ'),
     ('ࡰ', 'ࢇ'),
     ('ࢉ', 'ࢎ'),
-    ('\u{898}', '\u{8e1}'),
+    ('\u{897}', '\u{8e1}'),
     ('\u{8e3}', '\u{963}'),
     ('०', '९'),
     ('ॱ', 'ঃ'),
@@ -158,8 +158,8 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('ಪ', 'ಳ'),
     ('ವ', 'ಹ'),
     ('\u{cbc}', 'ೄ'),
-    ('\u{cc6}', 'ೈ'),
-    ('ೊ', '\u{ccd}'),
+    ('\u{cc6}', '\u{cc8}'),
+    ('\u{cca}', '\u{ccd}'),
     ('\u{cd5}', '\u{cd6}'),
     ('ೝ', 'ೞ'),
     ('ೠ', '\u{ce3}'),
@@ -243,8 +243,8 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('ᚁ', 'ᚚ'),
     ('ᚠ', 'ᛪ'),
     ('ᛮ', 'ᛸ'),
-    ('ᜀ', '᜕'),
-    ('ᜟ', '᜴'),
+    ('ᜀ', '\u{1715}'),
+    ('ᜟ', '\u{1734}'),
     ('ᝀ', '\u{1753}'),
     ('ᝠ', 'ᝬ'),
     ('ᝮ', 'ᝰ'),
@@ -276,11 +276,11 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('\u{1b00}', 'ᭌ'),
     ('᭐', '᭙'),
     ('\u{1b6b}', '\u{1b73}'),
-    ('\u{1b80}', '᯳'),
+    ('\u{1b80}', '\u{1bf3}'),
     ('ᰀ', '\u{1c37}'),
     ('᱀', '᱉'),
     ('ᱍ', 'ᱽ'),
-    ('ᲀ', 'ᲈ'),
+    ('ᲀ', 'ᲊ'),
     ('Ა', 'Ჺ'),
     ('Ჽ', 'Ჿ'),
     ('\u{1cd0}', '\u{1cd2}'),
@@ -367,10 +367,10 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('ꙿ', '\u{a6f1}'),
     ('ꜗ', 'ꜟ'),
     ('Ꜣ', 'ꞈ'),
-    ('Ꞌ', 'ꟊ'),
+    ('Ꞌ', 'ꟍ'),
     ('Ꟑ', 'ꟑ'),
     ('ꟓ', 'ꟓ'),
-    ('ꟕ', 'ꟙ'),
+    ('ꟕ', 'Ƛ'),
     ('ꟲ', 'ꠧ'),
     ('\u{a82c}', '\u{a82c}'),
     ('ꡀ', 'ꡳ'),
@@ -379,9 +379,9 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('\u{a8e0}', 'ꣷ'),
     ('ꣻ', 'ꣻ'),
     ('ꣽ', '\u{a92d}'),
-    ('ꤰ', '꥓'),
+    ('ꤰ', '\u{a953}'),
     ('ꥠ', 'ꥼ'),
-    ('\u{a980}', '꧀'),
+    ('\u{a980}', '\u{a9c0}'),
     ('ꧏ', '꧙'),
     ('ꧠ', 'ꧾ'),
     ('ꨀ', '\u{aa36}'),
@@ -468,6 +468,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('𐖣', '𐖱'),
     ('𐖳', '𐖹'),
     ('𐖻', '𐖼'),
+    ('𐗀', '𐗳'),
     ('𐘀', '𐜶'),
     ('𐝀', '𐝕'),
     ('𐝠', '𐝧'),
@@ -508,10 +509,14 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('𐳀', '𐳲'),
     ('𐴀', '\u{10d27}'),
     ('𐴰', '𐴹'),
+    ('𐵀', '𐵥'),
+    ('\u{10d69}', '\u{10d6d}'),
+    ('𐵯', '𐶅'),
     ('𐺀', '𐺩'),
     ('\u{10eab}', '\u{10eac}'),
     ('𐺰', '𐺱'),
-    ('\u{10efd}', '𐼜'),
+    ('𐻂', '𐻄'),
+    ('\u{10efc}', '𐼜'),
     ('𐼧', '𐼧'),
     ('𐼰', '\u{10f50}'),
     ('𐽰', '\u{10f85}'),
@@ -551,12 +556,22 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('𑌵', '𑌹'),
     ('\u{1133b}', '𑍄'),
     ('𑍇', '𑍈'),
-    ('𑍋', '𑍍'),
+    ('𑍋', '\u{1134d}'),
     ('𑍐', '𑍐'),
     ('\u{11357}', '\u{11357}'),
     ('𑍝', '𑍣'),
     ('\u{11366}', '\u{1136c}'),
     ('\u{11370}', '\u{11374}'),
+    ('𑎀', '𑎉'),
+    ('𑎋', '𑎋'),
+    ('𑎎', '𑎎'),
+    ('𑎐', '𑎵'),
+    ('𑎷', '\u{113c0}'),
+    ('\u{113c2}', '\u{113c2}'),
+    ('\u{113c5}', '\u{113c5}'),
+    ('\u{113c7}', '𑏊'),
+    ('𑏌', '𑏓'),
+    ('\u{113e1}', '\u{113e2}'),
     ('𑐀', '𑑊'),
     ('𑑐', '𑑙'),
     ('\u{1145e}', '𑑡'),
@@ -571,6 +586,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('𑙐', '𑙙'),
     ('𑚀', '𑚸'),
     ('𑛀', '𑛉'),
+    ('𑛐', '𑛣'),
     ('𑜀', '𑜚'),
     ('\u{1171d}', '\u{1172b}'),
     ('𑜰', '𑜹'),
@@ -594,6 +610,8 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('𑩐', '\u{11a99}'),
     ('𑪝', '𑪝'),
     ('𑪰', '𑫸'),
+    ('𑯀', '𑯠'),
+    ('𑯰', '𑯹'),
     ('𑰀', '𑰈'),
     ('𑰊', '\u{11c36}'),
     ('\u{11c38}', '𑱀'),
@@ -618,15 +636,17 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('\u{11f00}', '𑼐'),
     ('𑼒', '\u{11f3a}'),
     ('𑼾', '\u{11f42}'),
-    ('𑽐', '𑽙'),
+    ('𑽐', '\u{11f5a}'),
     ('𑾰', '𑾰'),
     ('𒀀', '𒎙'),
     ('𒐀', '𒑮'),
     ('𒒀', '𒕃'),
     ('𒾐', '𒿰'),
     ('𓀀', '𓐯'),
     ('\u{13440}', '\u{13455}'),
+    ('𓑠', '𔏺'),
     ('𔐀', '𔙆'),
+    ('𖄀', '𖄹'),
     ('𖠀', '𖨸'),
     ('𖩀', '𖩞'),
     ('𖩠', '𖩩'),
@@ -639,16 +659,18 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('𖭐', '𖭙'),
     ('𖭣', '𖭷'),
     ('𖭽', '𖮏'),
+    ('𖵀', '𖵬'),
+    ('𖵰', '𖵹'),
     ('𖹀', '𖹿'),
     ('𖼀', '𖽊'),
     ('\u{16f4f}', '𖾇'),
     ('\u{16f8f}', '𖾟'),
     ('𖿠', '𖿡'),
     ('𖿣', '\u{16fe4}'),
-    ('𖿰', '𖿱'),
+    ('\u{16ff0}', '\u{16ff1}'),
     ('𗀀', '𘟷'),
     ('𘠀', '𘳕'),
-    ('𘴀', '𘴈'),
+    ('𘳿', '𘴈'),
     ('𚿰', '𚿳'),
     ('𚿵', '𚿻'),
     ('𚿽', '𚿾'),
@@ -663,10 +685,11 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('𛲀', '𛲈'),
     ('𛲐', '𛲙'),
     ('\u{1bc9d}', '\u{1bc9e}'),
+    ('𜳰', '𜳹'),
     ('\u{1cf00}', '\u{1cf2d}'),
     ('\u{1cf30}', '\u{1cf46}'),
     ('\u{1d165}', '\u{1d169}'),
-    ('𝅭', '\u{1d172}'),
+    ('\u{1d16d}', '\u{1d172}'),
     ('\u{1d17b}', '\u{1d182}'),
     ('\u{1d185}', '\u{1d18b}'),
     ('\u{1d1aa}', '\u{1d1ad}'),
@@ -724,6 +747,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('𞊐', '\u{1e2ae}'),
     ('𞋀', '𞋹'),
     ('𞓐', '𞓹'),
+    ('𞗐', '𞗺'),
     ('𞟠', '𞟦'),
     ('𞟨', '𞟫'),
     ('𞟭', '𞟮'),
@@ -774,6 +798,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
     ('𫝀', '𫠝'),
     ('𫠠', '𬺡'),
     ('𬺰', '𮯠'),
+    ('𮯰', '𮹝'),
     ('丽', '𪘀'),
     ('𰀀', '𱍊'),
     ('𱍐', '𲎯'),

diff --git a/regex-cli/cmd/generate/unicode.rs b/regex-cli/cmd/generate/unicode.rs
@@ -84,6 +84,23 @@ USAGE:
     gen(d.join("sentence_break.rs"), &["sentence-break", &ucd, "--chars"])?;
     gen(d.join("word_break.rs"), &["word-break", &ucd, "--chars"])?;
 
+    // These generate the \w, \d and \s Unicode-aware character classes for
+    // regex-syntax. \d and \s are technically already covered by the general
+    // category and boolean property tables generated above. However, the Perl
+    // classes are generated separately so that they can be enabled or
+    // disabled via Cargo features independently of whether all boolean
+    // properties or general categories are enabled or disabled. The crate
+    // ensures that only one copy is compiled.
+    gen(d.join("perl_word.rs"), &["perl-word", &ucd, "--chars"])?;
+    gen(
+        d.join("perl_decimal.rs"),
+        &["general-category", &ucd, "--chars", "--include", "decimalnumber"],
+    )?;
+    gen(
+        d.join("perl_space.rs"),
+        &["property-bool", &ucd, "--chars", "--include", "whitespace"],
+    )?;
+
     // Data tables for regex-automata.
     let d = out
         .join("regex-automata")

diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
@@ -3143,10 +3143,31 @@ mod tests {
         #[cfg(feature = "unicode-script")]
         assert_eq!(
             t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
+            // Class({
+            //     '·'..='·',
+            //     '\u{300}'..='\u{301}',
+            //     '\u{304}'..='\u{304}',
+            //     '\u{306}'..='\u{306}',
+            //     '\u{308}'..='\u{308}',
+            //     '\u{313}'..='\u{313}',
+            //     '\u{342}'..='\u{342}',
+            //     '\u{345}'..='\u{345}',
+            //     'ʹ'..='ʹ',
+            //     '\u{1dc0}'..='\u{1dc1}',
+            //     '⁝'..='⁝',
+            // })
             hir_uclass(&[
+                ('·', '·'),
+                ('\u{0300}', '\u{0301}'),
+                ('\u{0304}', '\u{0304}'),
+                ('\u{0306}', '\u{0306}'),
+                ('\u{0308}', '\u{0308}'),
+                ('\u{0313}', '\u{0313}'),
                 ('\u{0342}', '\u{0342}'),
                 ('\u{0345}', '\u{0345}'),
+                ('ʹ', 'ʹ'),
                 ('\u{1DC0}', '\u{1DC1}'),
+                ('⁝', '⁝'),
             ])
         );
         assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));

diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs
@@ -675,6 +675,8 @@ fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
             ("V13_0", age::V13_0),
             ("V14_0", age::V14_0),
             ("V15_0", age::V15_0),
+            ("V15_1", age::V15_1),
+            ("V16_0", age::V16_0),
         ];
         assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");