Skip to content

Commit

Permalink
Further improvements for casefolding
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Oct 20, 2018
1 parent 40cd160 commit d04598c
Show file tree
Hide file tree
Showing 6 changed files with 436 additions and 31 deletions.
2 changes: 1 addition & 1 deletion REQUIRE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
julia 0.6 2-
ModuleInterfaceTools 0.1.6
ModuleInterfaceTools 0.1.7
StrAPI 0.1.8
CharSetEncodings 0.1.8
321 changes: 321 additions & 0 deletions src/CaseTables.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
__precompile__(true)
"""
Case folding tables for Unicode characters
Copyright 2018 Gandalf Software, Inc., Scott P. Jones
Licensed under MIT License, see LICENSE.md
"""
module CaseTables

struct CaseTable
l_tab::NTuple{128,UInt8}
u_tab::NTuple{128,UInt8}
t_tab::NTuple{64,UInt8}

is_l_tab::NTuple{256, UInt8}
is_u_tab::NTuple{256, UInt8}
can_l_tab::NTuple{256, UInt8}
can_u_tab::NTuple{256, UInt8}

can_l_flg::UInt128
can_u_flg::UInt128
can_t_flg::UInt128
can_sl_flg::UInt128
can_su_flg::UInt128

is_l_flg::UInt128
is_u_flg::UInt128
is_sl_flg::UInt128
is_su_flg::UInt128

siz_l_flg::UInt128
siz_u_flg::UInt128
max_siz_l::UInt32
max_siz_u::UInt32
end

@static VERSION < v"0.7-" && (const islowercase = islower; const isuppercase = isupper)

# Calculate tables (later move these to library built based on Unicode tables with BinaryBuilder,
# loaded by BinaryProvider)

# For speed, we want to know which characters can be lowercased, uppercased, or titlecased
# We want to also know which ones increase or decrease in size in UTF-8 encoding

@inline utf8_len(cp::Unsigned) = ifelse(cp <= 0x7f, 1, ifelse(cp <= 0x7ff, 2, 3 + (cp > 0xffff)))
@inline utf8_len(ch::Char) = utf8_len(ch%UInt32)

const _z4 = 0%UInt32

const zerotup = (_z4, _z4, _z4, _z4, _z4, _z4, _z4, _z4,
_z4, _z4, _z4, _z4, _z4, _z4, _z4, _z4)

function _add_to_bv!(vec, tmp)
tt = tuple(tmp...)
tt == zerotup && return 0x0
fill!(tmp, 0%UInt32)
for (i, t) in enumerate(vec)
(t == tt) && return UInt8(i)
end
push!(vec, tt)
UInt8(length(vec))
end

# case tables should be:
# 128-bit bitmap to say whether a particular range of 512 code points can be folded
# 64-byte ntuple{32,UInt16}
function case_tables()
bitvec = NTuple{16,UInt32}[] # each element represents 512 characters
tupvec = NTuple{32,UInt16}[] # each element represents 32 characters
offvec = NTuple{32,UInt8}[] # each element maps 1024 character range to offsets in tupvec

sizvecl = Pair{UInt16,UInt64}[]
sizvecu = Pair{UInt16,UInt64}[]

# Top level tables
l_tab = fill(0x0, 128)
u_tab = fill(0x0, 128)
t_tab = fill(0x0, 64)

# The elements of these are all offsets into bitvec
is_l_tab = fill(0x0, 256)
is_u_tab = fill(0x0, 256)
can_l_tab = fill(0x0, 256)
can_u_tab = fill(0x0, 256)
is_l_flg = is_u_flg = is_sl_flg = is_su_flg = 0%UInt128

# These tables get reused, stored in offvec
l_off = fill(0x0, 32)
u_off = fill(0x0, 32)
t_off = fill(0x0, 32)

# These tables get reused, stored in tupvec
tmp_l = fill(0x0000, 32)
tmp_u = fill(0x0000, 32)
tmp_t = fill(0x0000, 32)

# These tables get reused, stored in bitvec
bit_is_l = fill(0%UInt32, 16)
bit_is_u = fill(0%UInt32, 16)
bit_can_l = fill(0%UInt32, 16)
bit_can_u = fill(0%UInt32, 16)

# Handle BMP
l_mid = u_mid = t_mid = false
can_l_flg = can_u_flg = can_t_flg = 0%UInt128
siz_l_flg = siz_u_flg = 0%UInt128
max_siz_l = max_siz_u = 0%UInt32
tmp_is_l = tmp_is_u = tmp_can_l = tmp_can_u = 0%UInt32
for rng in (0x0080:0x20:0xd7e0, 0xe000:0x20:0xffe0), base in rng
hipos = (base >>> 10) + 1
hibit = UInt128(1) << (base >>> 9)
midbits = ((base >>> 5) & 0x1f) + 1

# Handle block of 32 characters
l_flg = u_flg = t_flg = false
diff_l = diff_u = 0%UInt64
for off = 0x00:0x1f
cp = UInt16(base + off)
lowbit = UInt32(1) << off
ch = Char(cp)
cl = UInt16(lowercase(ch))
cu = UInt16(uppercase(ch))
ct = UInt16(titlecase(ch))
tmp_l[off+1] = cl
tmp_u[off+1] = cu
tmp_t[off+1] = ct
islowercase(ch) && (tmp_is_l |= lowbit)
isuppercase(ch) && (tmp_is_u |= lowbit)
cp === cl === cu === ct && continue
sizc = utf8_len(ch)
sizu = utf8_len(cu)
sizt = utf8_len(ct)
if cl !== cp
l_flg = true
tmp_can_l |= lowbit
diff = utf8_len(cl) - sizc
-2 <= diff <= 1 || error("Size difference for $cp -> $cl is $diff")
diff == 0 || (diff_l |= (UInt64(diff & 3)<<(off<<1)); max_siz_l = cp)
end
if cu !== cp
u_flg = true
tmp_can_u |= lowbit
diff = sizu - sizc
-2 <= diff <= 1 || error("Size difference for $cp -> $cu is $diff")
diff == 0 || (diff_u |= (UInt64(diff & 3)<<(off<<1)); max_siz_u = cp)
end
if ct !== cu
t_flg = true
sizu == sizt ||
error("Titlecase and Uppercase are not same size in UTF-8: " *
"$cp $cu:$sizu $ct:$sizt")
end
end

bitoff = ((base >>> 5) & 0xf) + 1
bit_is_l[bitoff] = tmp_is_l
bit_is_u[bitoff] = tmp_is_u
bit_can_l[bitoff] = tmp_can_l
bit_can_u[bitoff] = tmp_can_u

tmp_is_l = tmp_is_u = tmp_can_l = tmp_can_u = 0%UInt32

if l_flg
can_l_flg |= hibit
l_mid = true # Have at least one in this set of blocks
push!(tupvec, tuple(tmp_l...))
l_off[midbits] = UInt8(length(tupvec))
end
if u_flg
can_u_flg |= hibit
u_mid = true
push!(tupvec, tuple(tmp_u...))
u_off[midbits] = UInt8(length(tupvec))
end
if t_flg
can_t_flg |= hibit
t_mid = true
push!(tupvec, tuple(tmp_t...))
t_off[midbits] = UInt8(length(tupvec))
else
t_off[midbits] = u_off[midbits]
end

diff_l == 0 || (siz_l_flg |= hibit; push!(sizvecl, base => diff_l))
diff_u == 0 || (siz_u_flg |= hibit; push!(sizvecu, base => diff_u))

if bitoff == 16
# Reset bits
pos = (base >>> 9) + 1
(is_l_tab[pos] = _add_to_bv!(bitvec, bit_is_l)) == 0 || (is_l_flg |= hibit)
(is_u_tab[pos] = _add_to_bv!(bitvec, bit_is_u)) == 0 || (is_u_flg |= hibit)
(can_l_tab[pos] = _add_to_bv!(bitvec, bit_can_l)) == 0 || (can_l_flg |= hibit)
(can_u_tab[pos] = _add_to_bv!(bitvec, bit_can_u)) == 0 || (can_u_flg |= hibit)
end

# Check for end of chunk
midbits == 32 || continue

if l_mid
push!(offvec, tuple(l_off...))
l_tab[hipos] = UInt8(length(offvec))
fill!(l_off, 0x0)
l_mid = false
end
if u_mid
push!(offvec, tuple(u_off...))
u_tab[hipos] = UInt8(length(offvec))
fill!(u_off, 0x0)
u_mid = false
end
if t_mid
push!(offvec, tuple(t_off...))
t_tab[hipos] = UInt8(length(offvec))
t_mid = false
else
t_tab[hipos] = u_tab[hipos]
end
end

# Handle SLP
can_sl_flg = can_su_flg = 0%UInt128

for base in 0x0000:0x20:0xffe0
hipos = (base >>> 10) + 65
hibit = (1%UInt128) << ((base >>> 9) & 0x7f)
midbits = ((base >>> 5) & 0x1f) + 1

# Handle block of 32 characters
l_flg = u_flg = false
for off = 0x00:0x1f
cp = UInt32(0x10000 + base + off)
ch = Char(cp)
cl = UInt32(lowercase(ch))
cu = UInt32(uppercase(ch))
ct = UInt32(titlecase(ch))
lowbit = UInt32(1) << off
islowercase(ch) && (tmp_is_l |= lowbit)
isuppercase(ch) && (tmp_is_u |= lowbit)
tmp_l[off+1] = cl%UInt16
tmp_u[off+1] = cu%UInt16
cp === cl === cu === ct && continue
ct == cu || error("titlecase: $cp -> $ct not same as uppercase $cu")
sizc = 4
sizl = utf8_len(cl)
sizu = utf8_len(cu)
sizl == sizu == 4 || error("UTF-8 sizes not 4: $cp: $cl=$sizl, $cu=$sizu")
if cl !== cp
tmp_can_l |= lowbit
l_flg = true
end
if cu !== cp
tmp_can_u |= lowbit
u_flg = true
end
end

if l_flg
l_mid = true
push!(tupvec, tuple(tmp_l...))
l_off[midbits] = UInt8(length(tupvec))
end
if u_flg
u_mid = true
push!(tupvec, tuple(tmp_u...))
u_off[midbits] = UInt8(length(tupvec))
end

bitoff = ((base >>> 5) & 0xf) + 1
bit_is_l[bitoff] = tmp_is_l
bit_is_u[bitoff] = tmp_is_u
bit_can_l[bitoff] = tmp_can_l
bit_can_u[bitoff] = tmp_can_u

tmp_is_l = tmp_is_u = tmp_can_l = tmp_can_u = 0%UInt32

if bitoff == 16
# Reset bits
pos = (base >>> 9) + 129
(is_l_tab[pos] = _add_to_bv!(bitvec, bit_is_l)) == 0 || (is_sl_flg |= hibit)
(is_u_tab[pos] = _add_to_bv!(bitvec, bit_is_u)) == 0 || (is_su_flg |= hibit)
(can_l_tab[pos] = _add_to_bv!(bitvec, bit_can_l)) == 0 || (can_sl_flg |= hibit)
(can_u_tab[pos] = _add_to_bv!(bitvec, bit_can_u)) == 0 || (can_su_flg |= hibit)
end

# Check for end of chunk
midbits == 32 || continue

if l_mid
push!(offvec, tuple(l_off...))
l_tab[hipos] = UInt8(length(offvec))
fill!(l_off, 0x0)
l_mid = false
end
if u_mid
push!(offvec, tuple(u_off...))
u_tab[hipos] = UInt8(length(offvec))
fill!(u_off, 0x0)
u_mid = false
end
end

# Check that there are no upper / lower / title case characters above SLP
for cp in 0x20000:0x10ffff
ch = Char(cp)
ch == lowercase(ch) == uppercase(ch) == titlecase(ch) ||
error("$cp has lower/upper/titlecase")
end

(CaseTable(tuple(l_tab...), tuple(u_tab...), tuple(t_tab...),
tuple(is_l_tab...), tuple(is_u_tab...),
tuple(can_l_tab...), tuple(can_u_tab...),
can_l_flg, can_u_flg, can_t_flg, can_sl_flg, can_su_flg,
is_l_flg, is_u_flg, is_sl_flg, is_su_flg,
siz_l_flg, siz_u_flg, max_siz_l, max_siz_u),
tuple(tupvec...), tuple(offvec...), tuple(bitvec...),
tuple(sizvecl...), tuple(sizvecu...))
end

const ct, tupvec, offvec, bitvec, sizvecl, sizvecu = case_tables()

end # module CaseTables
7 changes: 4 additions & 3 deletions src/ChrBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,15 @@ using ModuleInterfaceTools
LatinChars, ByteChars, WideChars, AbsChar, bytoff, chroff, chrdiff, utf_trail,
codepoint_cse, codepoint_rng, codepoint_adj, utf8proc_error,
write_utf8, write_utf16, _write_utf8_2, _write_utf8_3, _write_utf8_4, _write_ucs2,
_lowercase_l, _uppercase_l, _lowercase_u, _uppercase_u, _titlecase_u,
_islower_a, _islower_l, _islower_u, _isupper_a, _isupper_l, _isupper_al, _isupper_u,
_can_upper, _can_upper_l
_lowercase_l, _uppercase_l,
_is_lower_a, _is_lower_l, _is_lower_al, _is_lower_ch,
_is_upper_a, _is_upper_l, _is_upper_al, _is_upper_ch

@api develop! _isvalid_chr

include("core.jl")
@static V6_COMPAT && include("compat.jl")
include("CaseTables.jl"); using .CaseTables
include("casefold.jl")
include("io.jl")
include("traits.jl")
Expand Down
Loading

0 comments on commit d04598c

Please sign in to comment.