Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

data: update to UCD 16 #1229

Merged
merged 2 commits into from
Sep 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions regex-automata/src/nfa/thompson/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -230,15 +230,15 @@ impl Config {
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::nfa::thompson::NFA;
///
/// // 300KB isn't enough!
/// // 400KB isn't enough!
/// NFA::compiler()
/// .configure(NFA::config().nfa_size_limit(Some(300_000)))
/// .configure(NFA::config().nfa_size_limit(Some(400_000)))
/// .build(r"\w{20}")
/// .unwrap_err();
///
/// // ... but 400KB probably is.
/// // ... but 500KB probably is.
/// let nfa = NFA::compiler()
/// .configure(NFA::config().nfa_size_limit(Some(400_000)))
/// .configure(NFA::config().nfa_size_limit(Some(500_000)))
/// .build(r"\w{20}")?;
///
/// assert_eq!(nfa.pattern_len(), 1);
Expand Down
65 changes: 45 additions & 20 deletions regex-automata/src/util/unicode_data/perl_word.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate perl-word tmp/ucd-15.0.0/ --chars
// ucd-generate perl-word ucd-16.0.0 --chars
//
// Unicode version: 15.0.0.
// Unicode version: 16.0.0.
//
// ucd-generate 0.2.15 is available on crates.io.
// ucd-generate 0.3.1 is available on crates.io.

pub const PERL_WORD: &'static [(char, char)] = &[
('0', '9'),
Expand Down Expand Up @@ -59,7 +59,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('ࡠ', 'ࡪ'),
('ࡰ', 'ࢇ'),
('ࢉ', 'ࢎ'),
('\u{898}', '\u{8e1}'),
('\u{897}', '\u{8e1}'),
('\u{8e3}', '\u{963}'),
('०', '९'),
('ॱ', 'ঃ'),
Expand Down Expand Up @@ -158,8 +158,8 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('ಪ', 'ಳ'),
('ವ', 'ಹ'),
('\u{cbc}', 'ೄ'),
('\u{cc6}', 'ೈ'),
('ೊ', '\u{ccd}'),
('\u{cc6}', '\u{cc8}'),
('\u{cca}', '\u{ccd}'),
('\u{cd5}', '\u{cd6}'),
('ೝ', 'ೞ'),
('ೠ', '\u{ce3}'),
Expand Down Expand Up @@ -243,8 +243,8 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('ᚁ', 'ᚚ'),
('ᚠ', 'ᛪ'),
('ᛮ', 'ᛸ'),
('ᜀ', '᜕'),
('ᜟ', '᜴'),
('ᜀ', '\u{1715}'),
('ᜟ', '\u{1734}'),
('ᝀ', '\u{1753}'),
('ᝠ', 'ᝬ'),
('ᝮ', 'ᝰ'),
Expand Down Expand Up @@ -276,11 +276,11 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('\u{1b00}', 'ᭌ'),
('᭐', '᭙'),
('\u{1b6b}', '\u{1b73}'),
('\u{1b80}', '᯳'),
('\u{1b80}', '\u{1bf3}'),
('ᰀ', '\u{1c37}'),
('᱀', '᱉'),
('ᱍ', 'ᱽ'),
('ᲀ', 'ᲈ'),
('ᲀ', '\u{1c8a}'),
('Ა', 'Ჺ'),
('Ჽ', 'Ჿ'),
('\u{1cd0}', '\u{1cd2}'),
Expand Down Expand Up @@ -367,10 +367,10 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('ꙿ', '\u{a6f1}'),
('ꜗ', 'ꜟ'),
('Ꜣ', 'ꞈ'),
('Ꞌ', 'ꟊ'),
('Ꞌ', '\u{a7cd}'),
('Ꟑ', 'ꟑ'),
('ꟓ', 'ꟓ'),
('ꟕ', 'ꟙ'),
('ꟕ', '\u{a7dc}'),
('ꟲ', 'ꠧ'),
('\u{a82c}', '\u{a82c}'),
('ꡀ', 'ꡳ'),
Expand All @@ -379,9 +379,9 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('\u{a8e0}', 'ꣷ'),
('ꣻ', 'ꣻ'),
('ꣽ', '\u{a92d}'),
('ꤰ', '꥓'),
('ꤰ', '\u{a953}'),
('ꥠ', 'ꥼ'),
('\u{a980}', '꧀'),
('\u{a980}', '\u{a9c0}'),
('ꧏ', '꧙'),
('ꧠ', 'ꧾ'),
('ꨀ', '\u{aa36}'),
Expand Down Expand Up @@ -468,6 +468,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('𐖣', '𐖱'),
('𐖳', '𐖹'),
('𐖻', '𐖼'),
('𐗀', '𐗳'),
('𐘀', '𐜶'),
('𐝀', '𐝕'),
('𐝠', '𐝧'),
Expand Down Expand Up @@ -508,10 +509,14 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('𐳀', '𐳲'),
('𐴀', '\u{10d27}'),
('𐴰', '𐴹'),
('𐵀', '𐵥'),
('\u{10d69}', '\u{10d6d}'),
('𐵯', '𐶅'),
('𐺀', '𐺩'),
('\u{10eab}', '\u{10eac}'),
('𐺰', '𐺱'),
('\u{10efd}', '𐼜'),
('𐻂', '𐻄'),
('\u{10efc}', '𐼜'),
('𐼧', '𐼧'),
('𐼰', '\u{10f50}'),
('𐽰', '\u{10f85}'),
Expand Down Expand Up @@ -551,12 +556,22 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('𑌵', '𑌹'),
('\u{1133b}', '𑍄'),
('𑍇', '𑍈'),
('𑍋', '𑍍'),
('𑍋', '\u{1134d}'),
('𑍐', '𑍐'),
('\u{11357}', '\u{11357}'),
('𑍝', '𑍣'),
('\u{11366}', '\u{1136c}'),
('\u{11370}', '\u{11374}'),
('𑎀', '𑎉'),
('𑎋', '𑎋'),
('𑎎', '𑎎'),
('𑎐', '𑎵'),
('𑎷', '\u{113c0}'),
('\u{113c2}', '\u{113c2}'),
('\u{113c5}', '\u{113c5}'),
('\u{113c7}', '𑏊'),
('𑏌', '𑏓'),
('\u{113e1}', '\u{113e2}'),
('𑐀', '𑑊'),
('𑑐', '𑑙'),
('\u{1145e}', '𑑡'),
Expand All @@ -571,6 +586,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('𑙐', '𑙙'),
('𑚀', '𑚸'),
('𑛀', '𑛉'),
('𑛐', '𑛣'),
('𑜀', '𑜚'),
('\u{1171d}', '\u{1172b}'),
('𑜰', '𑜹'),
Expand All @@ -594,6 +610,8 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('𑩐', '\u{11a99}'),
('𑪝', '𑪝'),
('𑪰', '𑫸'),
('𑯀', '𑯠'),
('𑯰', '𑯹'),
('𑰀', '𑰈'),
('𑰊', '\u{11c36}'),
('\u{11c38}', '𑱀'),
Expand All @@ -618,15 +636,17 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('\u{11f00}', '𑼐'),
('𑼒', '\u{11f3a}'),
('𑼾', '\u{11f42}'),
('𑽐', '𑽙'),
('𑽐', '\u{11f5a}'),
('𑾰', '𑾰'),
('𒀀', '𒎙'),
('𒐀', '𒑮'),
('𒒀', '𒕃'),
('𒾐', '𒿰'),
('𓀀', '𓐯'),
('\u{13440}', '\u{13455}'),
('𓑠', '𔏺'),
('𔐀', '𔙆'),
('𖄀', '𖄹'),
('𖠀', '𖨸'),
('𖩀', '𖩞'),
('𖩠', '𖩩'),
Expand All @@ -639,16 +659,18 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('𖭐', '𖭙'),
('𖭣', '𖭷'),
('𖭽', '𖮏'),
('𖵀', '𖵬'),
('𖵰', '𖵹'),
('𖹀', '𖹿'),
('𖼀', '𖽊'),
('\u{16f4f}', '𖾇'),
('\u{16f8f}', '𖾟'),
('𖿠', '𖿡'),
('𖿣', '\u{16fe4}'),
('𖿰', '𖿱'),
('\u{16ff0}', '\u{16ff1}'),
('𗀀', '𘟷'),
('𘠀', '𘳕'),
('𘴀', '𘴈'),
('𘳿', '𘴈'),
('𚿰', '𚿳'),
('𚿵', '𚿻'),
('𚿽', '𚿾'),
Expand All @@ -663,10 +685,11 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('𛲀', '𛲈'),
('𛲐', '𛲙'),
('\u{1bc9d}', '\u{1bc9e}'),
('𜳰', '𜳹'),
('\u{1cf00}', '\u{1cf2d}'),
('\u{1cf30}', '\u{1cf46}'),
('\u{1d165}', '\u{1d169}'),
('𝅭', '\u{1d172}'),
('\u{1d16d}', '\u{1d172}'),
('\u{1d17b}', '\u{1d182}'),
('\u{1d185}', '\u{1d18b}'),
('\u{1d1aa}', '\u{1d1ad}'),
Expand Down Expand Up @@ -724,6 +747,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('𞊐', '\u{1e2ae}'),
('𞋀', '𞋹'),
('𞓐', '𞓹'),
('𞗐', '𞗺'),
('𞟠', '𞟦'),
('𞟨', '𞟫'),
('𞟭', '𞟮'),
Expand Down Expand Up @@ -774,6 +798,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
('𫝀', '𫠝'),
('𫠠', '𬺡'),
('𬺰', '𮯠'),
('𮯰', '𮹝'),
('丽', '𪘀'),
('𰀀', '𱍊'),
('𱍐', '𲎯'),
Expand Down
17 changes: 17 additions & 0 deletions regex-cli/cmd/generate/unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,23 @@ USAGE:
gen(d.join("sentence_break.rs"), &["sentence-break", &ucd, "--chars"])?;
gen(d.join("word_break.rs"), &["word-break", &ucd, "--chars"])?;

// These generate the \w, \d and \s Unicode-aware character classes for
// regex-syntax. \d and \s are technically part of the general category
// and boolean properties generated above. However, these are generated
// separately to make it possible to enable or disable them via Cargo
// features independently of whether all boolean properties or general
// categories are enabled or disabled. The crate ensures that only one copy
// is compiled.
gen(d.join("perl_word.rs"), &["perl-word", &ucd, "--chars"])?;
gen(
d.join("perl_decimal.rs"),
&["general-category", &ucd, "--chars", "--include", "decimalnumber"],
)?;
gen(
d.join("perl_space.rs"),
&["property-bool", &ucd, "--chars", "--include", "whitespace"],
)?;

// Data tables for regex-automata.
let d = out
.join("regex-automata")
Expand Down
21 changes: 21 additions & 0 deletions regex-syntax/src/hir/translate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3143,10 +3143,31 @@ mod tests {
#[cfg(feature = "unicode-script")]
assert_eq!(
t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
// Class({
// '·'..='·',
// '\u{300}'..='\u{301}',
// '\u{304}'..='\u{304}',
// '\u{306}'..='\u{306}',
// '\u{308}'..='\u{308}',
// '\u{313}'..='\u{313}',
// '\u{342}'..='\u{342}',
// '\u{345}'..='\u{345}',
// 'ʹ'..='ʹ',
// '\u{1dc0}'..='\u{1dc1}',
// '⁝'..='⁝',
// })
hir_uclass(&[
('·', '·'),
('\u{0300}', '\u{0301}'),
('\u{0304}', '\u{0304}'),
('\u{0306}', '\u{0306}'),
('\u{0308}', '\u{0308}'),
('\u{0313}', '\u{0313}'),
('\u{0342}', '\u{0342}'),
('\u{0345}', '\u{0345}'),
('ʹ', 'ʹ'),
('\u{1DC0}', '\u{1DC1}'),
('⁝', '⁝'),
])
);
assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));
Expand Down
2 changes: 2 additions & 0 deletions regex-syntax/src/unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,8 @@ fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
("V13_0", age::V13_0),
("V14_0", age::V14_0),
("V15_0", age::V15_0),
("V15_1", age::V15_1),
("V16_0", age::V16_0),
];
assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");

Expand Down
Loading