Skip to content

Commit

Permalink
fix unicode issue
Browse files Browse the repository at this point in the history
  • Loading branch information
TimWhiting committed Feb 6, 2024
1 parent f48555f commit bc32ca4
Show file tree
Hide file tree
Showing 6 changed files with 332 additions and 10 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
.vs/
.vscode/
support/vscode/koka.language-koka/whatsnew.md
EastAsianWidth.txt
src/Syntax/Lexer.hs.gen
node_modules/
out/
Expand Down
280 changes: 270 additions & 10 deletions lib/std/text/unicode.kk
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,18 @@ pub fun is-combining( c : char ) : bool {
(i >= 0x1AB0 && i <= 0x1AFF) ||
(i >= 0x1DC0 && i <= 0x1DFF) ||
(i >= 0x20D0 && i <= 0x20FF) ||
(i >= 0xFE20 && i <= 0xFE2F))
(i >= 0xFE20 && i <= 0xFE2F) ||
(i >= 0xFE00 && i <= 0xFE0F)) // Added variation selectors
// Should we instead add `zero-widths.force.contains(i)`?
}

// Join combining characters with their base into a grapheme.
fun join-combining( cs : list<char>, comb : list<char> = [], acc : list<grapheme> = []) : list<grapheme> {
match(cs) {
Cons(zwj, cc) | zwj.int == 0x200D -> // Handle zero-width-joiner
match cc
Cons(c, cc') -> cc'.join-combining(Cons(c, Cons(zwj,comb)), acc)
Nil -> cc.join-combining(Cons(zwj, comb), acc)
Cons(c,cc) -> if (c.is-combining)
then cc.join-combining( Cons(c,comb), acc )
else cc.join-combining( [c], consrev(comb,acc) )
Expand Down Expand Up @@ -119,21 +125,275 @@ pub fun string/width( s : string ) : int {
//--------------------------------------------------------------

// These characters are considered wide, i.e. 2 columns wide.
// https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
// See ranges with postfix ;W
//
// Update with `stack exec koka -- -e util/update-unicode -- -a`
// TODO: Handle 'unassigned' ranges: (Following is an excerpt from https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt)
// - All code points, assigned or unassigned, that are not listed
// explicitly are given the value "N".
// - The unassigned code points in the following blocks default to "W":
// CJK Unified Ideographs Extension A: U+3400..U+4DBF
// CJK Unified Ideographs: U+4E00..U+9FFF
// CJK Compatibility Ideographs: U+F900..U+FAFF
// - All undesignated code points in Planes 2 and 3, whether inside or
// outside of allocated blocks, default to "W":
// Plane 2: U+20000..U+2FFFD
// Plane 3: U+30000..U+3FFFD
// Range tree built from the `single(lo,hi)` entries listed below.
// The construction is wrapped in `delay`, so presumably the tree is only built
// when the value is first forced (confirm against the `delayed` API).
// Each entry gives a pair of code point bounds; presumably an inclusive range
// lo..hi (confirm against the definition of `single`).
// NOTE(review): the entries are not in ascending order and several overlap:
// 0x2E80..0x303E and 0x3040..0xA4CF cover many of the narrower entries that follow,
// 0xF900..0xFAFF, 0xFE10..0xFE19 and 0xFE30..0xFE6F cover the finer-grained entries
// after them, and 0x20000..0x2FFFD / 0x30000..0x3FFFD cover the plane 2 and 3
// entries at the end. Confirm that `build-rtree` tolerates unsorted, overlapping
// input, or drop the redundant entries.
val asian-wide : delayed<total,rtree> = delay{
build-rtree([
single(0x1100,0x115F),
single(0x231A,0x231B),
single(0x2329,0x2329),
single(0x232A,0x232A),
single(0x2E80,0x303E),
single(0x3040,0xA4CF),
single(0x23E9,0x23EC),
single(0x23F0,0x23F0),
single(0x23F3,0x23F3),
single(0x25FD,0x25FE),
single(0x2614,0x2615),
single(0x2648,0x2653),
single(0x267F,0x267F),
single(0x2693,0x2693),
single(0x26A1,0x26A1),
single(0x26AA,0x26AB),
single(0x26BD,0x26BE),
single(0x26C4,0x26C5),
single(0x26CE,0x26CE),
single(0x26D4,0x26D4),
single(0x26EA,0x26EA),
single(0x26F2,0x26F3),
single(0x26F5,0x26F5),
single(0x26FA,0x26FA),
single(0x26FD,0x26FD),
single(0x2705,0x2705),
single(0x270A,0x270B),
single(0x2728,0x2728),
single(0x274C,0x274C),
single(0x274E,0x274E),
single(0x2753,0x2755),
single(0x2757,0x2757),
single(0x2795,0x2797),
single(0x27B0,0x27B0),
single(0x27BF,0x27BF),
single(0x2B1B,0x2B1C),
single(0x2B50,0x2B50),
single(0x2B55,0x2B55),
single(0x2E80,0x2E99),
single(0x2E9B,0x2EF3),
single(0x2F00,0x2FD5),
single(0x2FF0,0x2FFF),
single(0x3001,0x3003),
single(0x3004,0x3004),
single(0x3005,0x3005),
single(0x3006,0x3006),
single(0x3007,0x3007),
single(0x3008,0x3008),
single(0x3009,0x3009),
single(0x300A,0x300A),
single(0x300B,0x300B),
single(0x300C,0x300C),
single(0x300D,0x300D),
single(0x300E,0x300E),
single(0x300F,0x300F),
single(0x3010,0x3010),
single(0x3011,0x3011),
single(0x3012,0x3013),
single(0x3014,0x3014),
single(0x3015,0x3015),
single(0x3016,0x3016),
single(0x3017,0x3017),
single(0x3018,0x3018),
single(0x3019,0x3019),
single(0x301A,0x301A),
single(0x301B,0x301B),
single(0x301C,0x301C),
single(0x301D,0x301D),
single(0x301E,0x301F),
single(0x3020,0x3020),
single(0x3021,0x3029),
single(0x302A,0x302D),
single(0x302E,0x302F),
single(0x3030,0x3030),
single(0x3031,0x3035),
single(0x3036,0x3037),
single(0x3038,0x303A),
single(0x303B,0x303B),
single(0x303C,0x303C),
single(0x303D,0x303D),
single(0x303E,0x303E),
single(0x3041,0x3096),
single(0x3099,0x309A),
single(0x309B,0x309C),
single(0x309D,0x309E),
single(0x309F,0x309F),
single(0x30A0,0x30A0),
single(0x30A1,0x30FA),
single(0x30FB,0x30FB),
single(0x30FC,0x30FE),
single(0x30FF,0x30FF),
single(0x3105,0x312F),
single(0x3131,0x318E),
single(0x3190,0x3191),
single(0x3192,0x3195),
single(0x3196,0x319F),
single(0x31A0,0x31BF),
single(0x31C0,0x31E3),
single(0x31EF,0x31EF),
single(0x31F0,0x31FF),
single(0x3200,0x321E),
single(0x3220,0x3229),
single(0x322A,0x3247),
single(0x3250,0x3250),
single(0x3251,0x325F),
single(0x3260,0x327F),
single(0x3280,0x3289),
single(0x328A,0x32B0),
single(0x32B1,0x32BF),
single(0x32C0,0x32FF),
single(0x3300,0x33FF),
single(0x3400,0x4DBF),
single(0x4E00,0x9FFF),
single(0xA000,0xA014),
single(0xA015,0xA015),
single(0xA016,0xA48C),
single(0xA490,0xA4C6),
single(0xA960,0xA97C),
single(0xAC00,0xD7A3),
single(0xF900,0xFAFF),
single(0xFE10,0xFE19),
single(0xFE30,0xFE6F),
single(0xFF00,0xFF60),
single(0xFFE0,0xFFE6),
single(0x20000,0x2FFFD),
single(0x30000,0x3FFFD),
single(0xF900,0xFA6D),
single(0xFA6E,0xFA6F),
single(0xFA70,0xFAD9),
single(0xFADA,0xFAFF),
single(0xFE10,0xFE16),
single(0xFE17,0xFE17),
single(0xFE18,0xFE18),
single(0xFE19,0xFE19),
single(0xFE30,0xFE30),
single(0xFE31,0xFE32),
single(0xFE33,0xFE34),
single(0xFE35,0xFE35),
single(0xFE36,0xFE36),
single(0xFE37,0xFE37),
single(0xFE38,0xFE38),
single(0xFE39,0xFE39),
single(0xFE3A,0xFE3A),
single(0xFE3B,0xFE3B),
single(0xFE3C,0xFE3C),
single(0xFE3D,0xFE3D),
single(0xFE3E,0xFE3E),
single(0xFE3F,0xFE3F),
single(0xFE40,0xFE40),
single(0xFE41,0xFE41),
single(0xFE42,0xFE42),
single(0xFE43,0xFE43),
single(0xFE44,0xFE44),
single(0xFE45,0xFE46),
single(0xFE47,0xFE47),
single(0xFE48,0xFE48),
single(0xFE49,0xFE4C),
single(0xFE4D,0xFE4F),
single(0xFE50,0xFE52),
single(0xFE54,0xFE57),
single(0xFE58,0xFE58),
single(0xFE59,0xFE59),
single(0xFE5A,0xFE5A),
single(0xFE5B,0xFE5B),
single(0xFE5C,0xFE5C),
single(0xFE5D,0xFE5D),
single(0xFE5E,0xFE5E),
single(0xFE5F,0xFE61),
single(0xFE62,0xFE62),
single(0xFE63,0xFE63),
single(0xFE64,0xFE66),
single(0xFE68,0xFE68),
single(0xFE69,0xFE69),
single(0xFE6A,0xFE6B),
single(0x16FE0,0x16FE1),
single(0x16FE2,0x16FE2),
single(0x16FE3,0x16FE3),
single(0x16FE4,0x16FE4),
single(0x16FF0,0x16FF1),
single(0x17000,0x187F7),
single(0x18800,0x18AFF),
single(0x18B00,0x18CD5),
single(0x18D00,0x18D08),
single(0x1AFF0,0x1AFF3),
single(0x1AFF5,0x1AFFB),
single(0x1AFFD,0x1AFFE),
single(0x1B000,0x1B0FF),
single(0x1B100,0x1B122),
single(0x1B132,0x1B132),
single(0x1B150,0x1B152),
single(0x1B155,0x1B155),
single(0x1B164,0x1B167),
single(0x1B170,0x1B2FB),
single(0x1F004,0x1F004),
single(0x1F0CF,0x1F0CF),
single(0x1F18E,0x1F18E),
single(0x1F191,0x1F19A),
single(0x1F200,0x1F202),
single(0x1F210,0x1F23B),
single(0x1F240,0x1F248),
single(0x1F250,0x1F251),
single(0x1F260,0x1F265),
single(0x1F300,0x1F320),
single(0x1F32D,0x1F335),
single(0x1F337,0x1F37C),
single(0x1F37E,0x1F393),
single(0x1F3A0,0x1F3CA),
single(0x1F3CF,0x1F3D3),
single(0x1F3E0,0x1F3F0),
single(0x1F3F4,0x1F3F4),
single(0x1F3F8,0x1F3FA),
single(0x1F3FB,0x1F3FF),
single(0x1F400,0x1F43E),
single(0x1F440,0x1F440),
single(0x1F442,0x1F4FC),
single(0x1F4FF,0x1F53D),
single(0x1F54B,0x1F54E),
single(0x1F550,0x1F567),
single(0x1F57A,0x1F57A),
single(0x1F595,0x1F596),
single(0x1F5A4,0x1F5A4),
single(0x1F5FB,0x1F5FF),
single(0x1F600,0x1F64F),
single(0x1F680,0x1F6C5),
single(0x1F6CC,0x1F6CC),
single(0x1F6D0,0x1F6D2),
single(0x1F6D5,0x1F6D7),
single(0x1F6DC,0x1F6DF),
single(0x1F6EB,0x1F6EC),
single(0x1F6F4,0x1F6FC),
single(0x1F7E0,0x1F7EB),
single(0x1F7F0,0x1F7F0),
single(0x1F90C,0x1F93A),
single(0x1F93C,0x1F945),
single(0x1F947,0x1F9FF),
single(0x1FA70,0x1FA7C),
single(0x1FA80,0x1FA88),
single(0x1FA90,0x1FABD),
single(0x1FABF,0x1FAC5),
single(0x1FACE,0x1FADB),
single(0x1FAE0,0x1FAE8),
single(0x1FAF0,0x1FAF8),
single(0x20000,0x2A6DF),
single(0x2A6E0,0x2A6FF),
single(0x2A700,0x2B739),
single(0x2B73A,0x2B73F),
single(0x2B740,0x2B81D),
single(0x2B81E,0x2B81F),
single(0x2B820,0x2CEA1),
single(0x2CEA2,0x2CEAF),
single(0x2CEB0,0x2EBE0),
single(0x2EBE1,0x2EBEF),
single(0x2EBF0,0x2EE5D),
single(0x2EE5E,0x2F7FF),
single(0x2F800,0x2FA1D),
single(0x2FA1E,0x2FA1F),
single(0x2FA20,0x2FFFD),
single(0x30000,0x3134A),
single(0x3134B,0x3134F),
single(0x31350,0x323AF),
single(0x323B0,0x3FFFD)
])
}

Expand Down
2 changes: 2 additions & 0 deletions package.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
# - support/vscode/koka.language-koka/package.json
# - whatsnew.md, readme.md

# Also update the unicode `asian-wide` list in `std/text/unicode`
# using the output of `stack exec koka -- -e util/update-unicode -- -a`

name: koka
version: 3.0.5
Expand Down
11 changes: 11 additions & 0 deletions test/lib/unicode.kk
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// https://github.com/koka-lang/koka/issues/457
// https://github.com/koka-lang/koka/issues/458
import std/text/unicode

fun main()
  // Code points of the sample, in order: 'h', 'i', U+2764 (heart),
  // U+FE0F (variation selector), U+200D (zero width joiner), U+1F525 (fire).
  val sample = "hi❤️‍🔥"
  // Prints the individual characters:
  // ['h','i','/u2764','/uFE0F','/u200D','/U01F525']
  println(sample.list)
  // Prints the number of graphemes in the sample.
  println(sample.graphemes.length)

  // Prints the column width of a single emoji.
  "👾".width.println
3 changes: 3 additions & 0 deletions test/lib/unicode.kk.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
['h','i','/u2764','/uFE0F','/u200D','/U01F525']
3
2
45 changes: 45 additions & 0 deletions util/update-unicode.kk
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import std/os/path
import std/os/dir
import std/os/file
import std/os/process
import std/os/env
import std/os/flags

// Command line options of this script; `asian-wide` is off unless set explicitly.
struct iflags( asian-wide : bool = False )

// Header text handed to `usage` when the command line help is printed.
val header : string = "usage:\n stack exec koka -- -e util/update-unicode [-- [options]]\n\noptions:"

// Descriptions of the supported command line flags; used both to parse the
// arguments and to render the usage text.
// `-a` / `--asian-wide` is a boolean flag that sets the `asian-wide` field.
val flag-descs =
  [ Flag( "a", ["asian-wide"], Bool( fn(opts : iflags, enabled : bool) { opts(asian-wide = enabled) } ), "print updated asian wide information" ) ]

// Parse the program arguments against `flag-descs`.
// Returns `Just` of the parsed flags when parsing reported no errors and left no
// unconsumed arguments. Otherwise prints the errors followed by the usage text
// and returns `Nothing`.
pub fun process-flags() : <ndet,console> maybe<iflags>
  val (parsed, leftover, problems) = parse( Iflags(), flag-descs, get-args() )
  if problems.is-cons || leftover.is-cons then
    println( problems.join("\n") ++ "\n" ++ flag-descs.usage(header) )
    Nothing
  else
    Just(parsed)

// Print one ` single(0x<lo>,0x<hi>),` table entry for every two-column-wide range
// found in the given contents of `EastAsianWidth.txt`.
// Raises an exception when a code point field is not of the form `X` or `X..Y`.
fun print-wide-ranges( contents : string )
  with line <- contents.split("\n").foreach()
  // Everything after `#` is a comment. Dropping it before splitting on `;` keeps
  // comment lines that happen to contain `;` (such as `# @missing:` directives)
  // from being parsed as data.
  val entry = line.split("#")[0].default("")
  match entry.split(";")
    Cons(codepoints, Cons(property, _)) ->
      val width = property.trim-right().trim-left()
      // Both `W` (wide) and `F` (fullwidth) code points take two columns (UAX #11).
      if width == "W" || width == "F" then
        match codepoints.trim-right().trim-left().split("..")
          Cons(lo, Cons(hi, Nil)) -> println(" single(0x" ++ lo ++ ",0x" ++ hi ++ "),")
          Cons(cp, Nil) -> println(" single(0x" ++ cp ++ ",0x" ++ cp ++ "),")
          other -> throw("Error unsupported range " ++ other.show)
    // Blank lines and comment-only lines have no `;` separated property field.
    _ -> ()

// Entry point: with `-a`, download `EastAsianWidth.txt` into the current
// directory and print the table entries for its wide ranges.
fun main()
  match process-flags()
    // Invalid command line: `process-flags` has already printed the errors and usage.
    Nothing -> ()
    Just(flags) ->
      if flags.asian-wide then
        val status = run-system(r#"curl "https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt" > EastAsianWidth.txt"#)
        if status != 0 then
          // The shell redirect creates the file even when curl fails, so the exit
          // status must be checked to avoid silently emitting an empty or partial table.
          println("Download of EastAsianWidth.txt failed: curl exited with status " ++ status.show)
        elif "EastAsianWidth.txt".path.is-file then
          print-wide-ranges( "EastAsianWidth.txt".path.read-text-file() )
        else
          println("Could not find EastAsianWidth.txt\n\tPlease download from https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt to your current directory")

0 comments on commit bc32ca4

Please sign in to comment.