Skip to content

Commit

Permalink
Merge pull request #825 from googlefonts/get-off-icu-patch
Browse files Browse the repository at this point in the history
[fontbe] Get off patched version of icu_properties
  • Loading branch information
cmyr authored May 21, 2024
2 parents 4484b48 + 0bf527d commit e0b1cb4
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 69 deletions.
6 changes: 0 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,3 @@ members = [
"fea-lsp",
"otl-normalizer",
]

# remove this when icu_properties 1.5 is released:
# https://github.com/unicode-org/icu4x/issues?q=is%3Aopen+is%3Aissue+milestone%3A%221.5+Blocking+⟨P1⟩%22
[patch.crates-io]
icu_properties = { version = "1.4", git = "https://github.com/unicode-org/icu4x.git", rev = "728eb44" }
tinystr = { version = "0.7.5", git = "https://github.com/unicode-org/icu4x.git", rev = "728eb44" }
13 changes: 3 additions & 10 deletions fontbe/src/features/kern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ struct KernSplitContext {
/// map of all mark glyphs + whether they are spacing or not
mark_glyphs: HashMap<GlyphId, MarkSpacing>,
glyph_scripts: HashMap<GlyphId, HashSet<UnicodeShortName>>,
bidi_glyphs: HashMap<BidiClass, HashSet<GlyphId>>,
bidi_glyphs: BTreeMap<BidiClass, HashSet<GlyphId>>,
opts: KernSplitOptions,
dflt_scripts: HashSet<UnicodeShortName>,
common_scripts: HashSet<UnicodeShortName>,
Expand Down Expand Up @@ -672,7 +672,7 @@ impl KernSplitContext {
) -> BTreeMap<BTreeSet<UnicodeShortName>, Vec<PendingLookup<PairPosBuilder>>> {
let mut lookups_by_script = BTreeMap::new();
let kerning_per_script = self.split_kerns(pairs);
let mut bidi_buf = HashSet::new(); // we can reuse this for each pair
let mut bidi_buf = BTreeSet::new(); // we can reuse this for each pair
for (scripts, pairs) in kerning_per_script {
let mut builder = PairPosBuilder::default();
for mut pair in pairs {
Expand Down Expand Up @@ -996,14 +996,7 @@ fn guess_font_scripts(ast: &ParseTree, glyphs: &impl CharMap) -> HashSet<Unicode
fn scripts_for_chars(glyphs: &impl CharMap) -> HashSet<UnicodeShortName> {
glyphs
.iter_glyphs()
.filter_map(|(_, codepoint)| {
let mut scripts = super::properties::unicode_script_extensions(codepoint);
// only if a codepoint has a single script do know it is supported
match (scripts.next(), scripts.next()) {
(Some(script), None) => Some(script),
_ => None,
}
})
.filter_map(|(_, codepoint)| super::properties::single_script_for_codepoint(codepoint))
.collect()
}

Expand Down
94 changes: 41 additions & 53 deletions fontbe/src/features/properties.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
//! Properties and constants related to unicode data
use std::{
collections::{HashMap, HashSet},
collections::{BTreeMap, HashMap, HashSet},
hash::Hash,
sync::Arc,
};

use fontir::ir::Glyph;
use icu_properties::{script::ScriptWithExtensionsBorrowed, BidiClass, Script};
use icu_properties::{BidiClass, Script};
use tinystr::tinystr;
use write_fonts::{
read::{tables::gsub::Gsub, ReadError},
types::{GlyphId, Tag},
Expand All @@ -16,16 +17,8 @@ use crate::features::ot_tags::{NEW_SCRIPTS, SCRIPT_ALIASES, SCRIPT_EXCEPTIONS_RE

use super::ot_tags::{DFLT_SCRIPT, INDIC_SCRIPTS, NEW_SCRIPT_TAGS, SCRIPT_EXCEPTIONS, USE_SCRIPTS};

// SAFETY: we can visually verify that these inputs contain only non-null ASCII bytes
// (this is the only way we can declare these as constants)
// TODO: remove this if https://github.com/unicode-org/icu4x/pull/4691 is merged/released
pub const COMMON_SCRIPT: UnicodeShortName =
unsafe { UnicodeShortName::from_bytes_unchecked(*b"Zyyy") };
pub const INHERITED_SCRIPT: UnicodeShortName =
unsafe { UnicodeShortName::from_bytes_unchecked(*b"Zinh") };

static SCRIPT_DATA: ScriptWithExtensionsBorrowed<'static> =
icu_properties::script::script_with_extensions();
pub const COMMON_SCRIPT: UnicodeShortName = tinystr!(4, "Zyyy");
pub const INHERITED_SCRIPT: UnicodeShortName = tinystr!(4, "Zinh");

/// The type used by icu4x for script names
pub type UnicodeShortName = tinystr::TinyAsciiStr<4>;
Expand Down Expand Up @@ -89,25 +82,16 @@ impl ScriptDirection {
}
}

/// Iterate over the unicode scripts + extensions for the provided codepoint
///
/// This returns the scripts as shortnames, because that's what python does.
/// It would probably make more sense for us to use the Script type defined by
/// icu4x, but I want to get a more direct port working first.
pub(crate) fn unicode_script_extensions(
c: u32,
) -> impl Iterator<Item = UnicodeShortName> + 'static {
/// Iff a codepoint belongs to a single script, return it.
pub(crate) fn single_script_for_codepoint(cp: u32) -> Option<UnicodeShortName> {
let lookup = Script::enum_to_short_name_mapper();
SCRIPT_DATA
.get_script_extensions_val(c)
.iter()
.map(move |script| {
lookup
.get(script)
// if we get a script it is by definition a 4-char ascii string,
// so this unwrap should never fail
.expect("names should be available for all defined scripts")
})
let data = icu_properties::script::script_with_extensions().get_script_extensions_val(cp);
let mut scripts = data.iter();

match (scripts.next(), scripts.next()) {
(Some(script), None) => lookup.get(script),
_ => None,
}
}

// <https://github.com/googlefonts/ufo2ft/blob/f6b4f42460b340c/Lib/ufo2ft/featureWriters/kernFeatureWriter.py#L49>
Expand All @@ -124,22 +108,24 @@ fn unicode_bidi_type(c: u32) -> Option<BidiClass> {

// equivalent to the 'classify' method in ufo2ft:
// <https://github.com/googlefonts/ufo2ft/blob/cea60d71dfcf0b1c0f/Lib/ufo2ft/util.py#L287>
fn classify<T, F, I, CM>(
fn classify<T, F, CM>(
char_map: &CM,
mut props_fn: F,
gsub: Option<&Gsub>,
) -> Result<HashMap<T, HashSet<GlyphId>>, ReadError>
) -> Result<BTreeMap<T, HashSet<GlyphId>>, ReadError>
where
T: Hash + Eq,
I: Iterator<Item = T>,
F: FnMut(u32) -> I,
T: Ord + Eq,
// instead of returning an iterator, pushes items into the provided buffer
F: FnMut(u32, &mut Vec<T>),
CM: CharMap,
{
let mut sets = HashMap::new();
let mut sets = BTreeMap::new();
let mut neutral_glyphs = HashSet::new();
let mut buf = Vec::new();
for (gid, unicode_value) in char_map.iter_glyphs() {
let mut has_props = false;
for prop in props_fn(unicode_value) {
props_fn(unicode_value, &mut buf);
for prop in buf.drain(..) {
sets.entry(prop).or_insert(HashSet::new()).insert(gid);
has_props = true;
}
Expand Down Expand Up @@ -169,23 +155,25 @@ pub(crate) fn scripts_by_glyph(
gsub: Option<&Gsub>,
) -> Result<HashMap<GlyphId, HashSet<UnicodeShortName>>, ReadError> {
let mut result = HashMap::new();
let lookup = Script::enum_to_short_name_mapper();
for (script, glyphs) in classify(
glyphs,
|cp| {
// we need to write this in such a way as to return a single concrete type;
// this is basically two branches, one or the other option is always `None`
let common = known_scripts.is_empty().then_some(COMMON_SCRIPT);
let other_branch = if known_scripts.is_empty() {
None
|cp, buf| {
if known_scripts.is_empty() {
buf.push(COMMON_SCRIPT);
} else {
Some(unicode_script_extensions(cp).filter(|script| {
*script == COMMON_SCRIPT
|| *script == INHERITED_SCRIPT
|| known_scripts.contains(script)
}))
};

common.into_iter().chain(other_branch.into_iter().flatten())
let data =
icu_properties::script::script_with_extensions().get_script_extensions_val(cp);
buf.extend(
data.iter()
.map(|s| lookup.get(s).unwrap())
.filter(|script| {
*script == COMMON_SCRIPT
|| *script == INHERITED_SCRIPT
|| known_scripts.contains(script)
}),
);
}
},
gsub,
)? {
Expand All @@ -200,10 +188,10 @@ pub(crate) fn scripts_by_glyph(
pub(crate) fn glyphs_by_bidi_class(
glyphs: &impl CharMap,
gsub: Option<&Gsub>,
) -> Result<HashMap<BidiClass, HashSet<GlyphId>>, ReadError> {
) -> Result<BTreeMap<BidiClass, HashSet<GlyphId>>, ReadError> {
classify(
glyphs,
|codepoint| unicode_bidi_type(codepoint).into_iter(),
|codepoint, buf| buf.extend(unicode_bidi_type(codepoint)),
gsub,
)
}
Expand Down

0 comments on commit e0b1cb4

Please sign in to comment.