diff --git a/.gitignore b/.gitignore
index 40542b3e0..7ce8273f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
 .vs/
 .vscode/
 support/vscode/koka.language-koka/whatsnew.md
+EastAsianWidth.txt
 src/Syntax/Lexer.hs.gen
 node_modules/
 out/
diff --git a/lib/std/text/unicode.kk b/lib/std/text/unicode.kk
index cf32faa00..ffa64dc30 100644
--- a/lib/std/text/unicode.kk
+++ b/lib/std/text/unicode.kk
@@ -35,12 +35,18 @@ pub fun is-combining( c : char ) : bool {
    (i >= 0x1AB0 && i <= 0x1AFF) ||
    (i >= 0x1DC0 && i <= 0x1DFF) ||
    (i >= 0x20D0 && i <= 0x20FF) ||
-   (i >= 0xFE20 && i <= 0xFE2F))
+   (i >= 0xFE20 && i <= 0xFE2F) ||
+   (i >= 0xFE00 && i <= 0xFE0F)) // Added variation selectors
+   // Should we instead add `zero-widths.force.contains(i)`?
 }
 
 // Join combining characters with their base into a grapheme.
 fun join-combining( cs : list, comb : list = [], acc : list = []) : list {
   match(cs) {
+    Cons(zwj, cc) | zwj.int == 0x200D -> // Handle zero-width-joiner
+      match cc
+        Cons(c, cc') -> cc'.join-combining(Cons(c, Cons(zwj,comb)), acc)
+        Nil -> cc.join-combining(Cons(zwj, comb), acc)
     Cons(c,cc) -> if (c.is-combining)
                    then cc.join-combining( Cons(c,comb), acc )
                    else cc.join-combining( [c], consrev(comb,acc) )
@@ -119,21 +125,278 @@ pub fun string/width( s : string ) : int {
 //--------------------------------------------------------------
 
 // These characters are considered wide, i.e. 2 columns wide.
+// https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
+// See ranges with postfix ;W (wide) and ;F (fullwidth); both take 2 columns.
+//
+// Update with `stack exec koka -- -e util/update-unicode -- -a`
+// TODO: Handle 'unassigned' ranges: (Following is an excerpt from https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt)
+// - All code points, assigned or unassigned, that are not listed
+//   explicitly are given the value "N".
+// - The unassigned code points in the following blocks default to "W":
+//     CJK Unified Ideographs Extension A: U+3400..U+4DBF
+//     CJK Unified Ideographs: U+4E00..U+9FFF
+//     CJK Compatibility Ideographs: U+F900..U+FAFF
+// - All undesignated code points in Planes 2 and 3, whether inside or
+//   outside of allocated blocks, default to "W":
+//     Plane 2: U+20000..U+2FFFD
+//     Plane 3: U+30000..U+3FFFD
 val asian-wide : delayed = delay{
   build-rtree([
     single(0x1100,0x115F),
+    single(0x231A,0x231B),
     single(0x2329,0x2329),
     single(0x232A,0x232A),
-    single(0x2E80,0x303E),
-    single(0x3040,0xA4CF),
+    single(0x23E9,0x23EC),
+    single(0x23F0,0x23F0),
+    single(0x23F3,0x23F3),
+    single(0x25FD,0x25FE),
+    single(0x2614,0x2615),
+    single(0x2648,0x2653),
+    single(0x267F,0x267F),
+    single(0x2693,0x2693),
+    single(0x26A1,0x26A1),
+    single(0x26AA,0x26AB),
+    single(0x26BD,0x26BE),
+    single(0x26C4,0x26C5),
+    single(0x26CE,0x26CE),
+    single(0x26D4,0x26D4),
+    single(0x26EA,0x26EA),
+    single(0x26F2,0x26F3),
+    single(0x26F5,0x26F5),
+    single(0x26FA,0x26FA),
+    single(0x26FD,0x26FD),
+    single(0x2705,0x2705),
+    single(0x270A,0x270B),
+    single(0x2728,0x2728),
+    single(0x274C,0x274C),
+    single(0x274E,0x274E),
+    single(0x2753,0x2755),
+    single(0x2757,0x2757),
+    single(0x2795,0x2797),
+    single(0x27B0,0x27B0),
+    single(0x27BF,0x27BF),
+    single(0x2B1B,0x2B1C),
+    single(0x2B50,0x2B50),
+    single(0x2B55,0x2B55),
+    single(0x2E80,0x2E99),
+    single(0x2E9B,0x2EF3),
+    single(0x2F00,0x2FD5),
+    single(0x2FF0,0x2FFF),
+    single(0x3000,0x3000),
+    single(0x3001,0x3003),
+    single(0x3004,0x3004),
+    single(0x3005,0x3005),
+    single(0x3006,0x3006),
+    single(0x3007,0x3007),
+    single(0x3008,0x3008),
+    single(0x3009,0x3009),
+    single(0x300A,0x300A),
+    single(0x300B,0x300B),
+    single(0x300C,0x300C),
+    single(0x300D,0x300D),
+    single(0x300E,0x300E),
+    single(0x300F,0x300F),
+    single(0x3010,0x3010),
+    single(0x3011,0x3011),
+    single(0x3012,0x3013),
+    single(0x3014,0x3014),
+    single(0x3015,0x3015),
+    single(0x3016,0x3016),
+    single(0x3017,0x3017),
+    single(0x3018,0x3018),
+    single(0x3019,0x3019),
+    single(0x301A,0x301A),
+    single(0x301B,0x301B),
+    single(0x301C,0x301C),
+    single(0x301D,0x301D),
+    single(0x301E,0x301F),
+    single(0x3020,0x3020),
+    single(0x3021,0x3029),
+    single(0x302A,0x302D),
+    single(0x302E,0x302F),
+    single(0x3030,0x3030),
+    single(0x3031,0x3035),
+    single(0x3036,0x3037),
+    single(0x3038,0x303A),
+    single(0x303B,0x303B),
+    single(0x303C,0x303C),
+    single(0x303D,0x303D),
+    single(0x303E,0x303E),
+    single(0x3041,0x3096),
+    single(0x3099,0x309A),
+    single(0x309B,0x309C),
+    single(0x309D,0x309E),
+    single(0x309F,0x309F),
+    single(0x30A0,0x30A0),
+    single(0x30A1,0x30FA),
+    single(0x30FB,0x30FB),
+    single(0x30FC,0x30FE),
+    single(0x30FF,0x30FF),
+    single(0x3105,0x312F),
+    single(0x3131,0x318E),
+    single(0x3190,0x3191),
+    single(0x3192,0x3195),
+    single(0x3196,0x319F),
+    single(0x31A0,0x31BF),
+    single(0x31C0,0x31E3),
+    single(0x31EF,0x31EF),
+    single(0x31F0,0x31FF),
+    single(0x3200,0x321E),
+    single(0x3220,0x3229),
+    single(0x322A,0x3247),
+    single(0x3250,0x3250),
+    single(0x3251,0x325F),
+    single(0x3260,0x327F),
+    single(0x3280,0x3289),
+    single(0x328A,0x32B0),
+    single(0x32B1,0x32BF),
+    single(0x32C0,0x32FF),
+    single(0x3300,0x33FF),
+    single(0x3400,0x4DBF),
+    single(0x4E00,0x9FFF),
+    single(0xA000,0xA014),
+    single(0xA015,0xA015),
+    single(0xA016,0xA48C),
+    single(0xA490,0xA4C6),
+    single(0xA960,0xA97C),
     single(0xAC00,0xD7A3),
-    single(0xF900,0xFAFF),
-    single(0xFE10,0xFE19),
-    single(0xFE30,0xFE6F),
-    single(0xFF00,0xFF60),
-    single(0xFFE0,0xFFE6),
-    single(0x20000,0x2FFFD),
-    single(0x30000,0x3FFFD),
+    single(0xF900,0xFA6D),
+    single(0xFA6E,0xFA6F),
+    single(0xFA70,0xFAD9),
+    single(0xFADA,0xFAFF),
+    single(0xFE10,0xFE16),
+    single(0xFE17,0xFE17),
+    single(0xFE18,0xFE18),
+    single(0xFE19,0xFE19),
+    single(0xFE30,0xFE30),
+    single(0xFE31,0xFE32),
+    single(0xFE33,0xFE34),
+    single(0xFE35,0xFE35),
+    single(0xFE36,0xFE36),
+    single(0xFE37,0xFE37),
+    single(0xFE38,0xFE38),
+    single(0xFE39,0xFE39),
+    single(0xFE3A,0xFE3A),
+    single(0xFE3B,0xFE3B),
+    single(0xFE3C,0xFE3C),
+    single(0xFE3D,0xFE3D),
+    single(0xFE3E,0xFE3E),
+    single(0xFE3F,0xFE3F),
+    single(0xFE40,0xFE40),
+    single(0xFE41,0xFE41),
+    single(0xFE42,0xFE42),
+    single(0xFE43,0xFE43),
+    single(0xFE44,0xFE44),
+    single(0xFE45,0xFE46),
+    single(0xFE47,0xFE47),
+    single(0xFE48,0xFE48),
+    single(0xFE49,0xFE4C),
+    single(0xFE4D,0xFE4F),
+    single(0xFE50,0xFE52),
+    single(0xFE54,0xFE57),
+    single(0xFE58,0xFE58),
+    single(0xFE59,0xFE59),
+    single(0xFE5A,0xFE5A),
+    single(0xFE5B,0xFE5B),
+    single(0xFE5C,0xFE5C),
+    single(0xFE5D,0xFE5D),
+    single(0xFE5E,0xFE5E),
+    single(0xFE5F,0xFE61),
+    single(0xFE62,0xFE62),
+    single(0xFE63,0xFE63),
+    single(0xFE64,0xFE66),
+    single(0xFE68,0xFE68),
+    single(0xFE69,0xFE69),
+    single(0xFE6A,0xFE6B),
+    single(0xFF01,0xFF60),
+    single(0xFFE0,0xFFE6),
+    single(0x16FE0,0x16FE1),
+    single(0x16FE2,0x16FE2),
+    single(0x16FE3,0x16FE3),
+    single(0x16FE4,0x16FE4),
+    single(0x16FF0,0x16FF1),
+    single(0x17000,0x187F7),
+    single(0x18800,0x18AFF),
+    single(0x18B00,0x18CD5),
+    single(0x18D00,0x18D08),
+    single(0x1AFF0,0x1AFF3),
+    single(0x1AFF5,0x1AFFB),
+    single(0x1AFFD,0x1AFFE),
+    single(0x1B000,0x1B0FF),
+    single(0x1B100,0x1B122),
+    single(0x1B132,0x1B132),
+    single(0x1B150,0x1B152),
+    single(0x1B155,0x1B155),
+    single(0x1B164,0x1B167),
+    single(0x1B170,0x1B2FB),
+    single(0x1F004,0x1F004),
+    single(0x1F0CF,0x1F0CF),
+    single(0x1F18E,0x1F18E),
+    single(0x1F191,0x1F19A),
+    single(0x1F200,0x1F202),
+    single(0x1F210,0x1F23B),
+    single(0x1F240,0x1F248),
+    single(0x1F250,0x1F251),
+    single(0x1F260,0x1F265),
+    single(0x1F300,0x1F320),
+    single(0x1F32D,0x1F335),
+    single(0x1F337,0x1F37C),
+    single(0x1F37E,0x1F393),
+    single(0x1F3A0,0x1F3CA),
+    single(0x1F3CF,0x1F3D3),
+    single(0x1F3E0,0x1F3F0),
+    single(0x1F3F4,0x1F3F4),
+    single(0x1F3F8,0x1F3FA),
+    single(0x1F3FB,0x1F3FF),
+    single(0x1F400,0x1F43E),
+    single(0x1F440,0x1F440),
+    single(0x1F442,0x1F4FC),
+    single(0x1F4FF,0x1F53D),
+    single(0x1F54B,0x1F54E),
+    single(0x1F550,0x1F567),
+    single(0x1F57A,0x1F57A),
+    single(0x1F595,0x1F596),
+    single(0x1F5A4,0x1F5A4),
+    single(0x1F5FB,0x1F5FF),
+    single(0x1F600,0x1F64F),
+    single(0x1F680,0x1F6C5),
+    single(0x1F6CC,0x1F6CC),
+    single(0x1F6D0,0x1F6D2),
+    single(0x1F6D5,0x1F6D7),
+    single(0x1F6DC,0x1F6DF),
+    single(0x1F6EB,0x1F6EC),
+    single(0x1F6F4,0x1F6FC),
+    single(0x1F7E0,0x1F7EB),
+    single(0x1F7F0,0x1F7F0),
+    single(0x1F90C,0x1F93A),
+    single(0x1F93C,0x1F945),
+    single(0x1F947,0x1F9FF),
+    single(0x1FA70,0x1FA7C),
+    single(0x1FA80,0x1FA88),
+    single(0x1FA90,0x1FABD),
+    single(0x1FABF,0x1FAC5),
+    single(0x1FACE,0x1FADB),
+    single(0x1FAE0,0x1FAE8),
+    single(0x1FAF0,0x1FAF8),
+    single(0x20000,0x2A6DF),
+    single(0x2A6E0,0x2A6FF),
+    single(0x2A700,0x2B739),
+    single(0x2B73A,0x2B73F),
+    single(0x2B740,0x2B81D),
+    single(0x2B81E,0x2B81F),
+    single(0x2B820,0x2CEA1),
+    single(0x2CEA2,0x2CEAF),
+    single(0x2CEB0,0x2EBE0),
+    single(0x2EBE1,0x2EBEF),
+    single(0x2EBF0,0x2EE5D),
+    single(0x2EE5E,0x2F7FF),
+    single(0x2F800,0x2FA1D),
+    single(0x2FA1E,0x2FA1F),
+    single(0x2FA20,0x2FFFD),
+    single(0x30000,0x3134A),
+    single(0x3134B,0x3134F),
+    single(0x31350,0x323AF),
+    single(0x323B0,0x3FFFD)
   ])
 }
 
diff --git a/package.yaml b/package.yaml
index d178457eb..ceb09f76d 100644
--- a/package.yaml
+++ b/package.yaml
@@ -8,6 +8,8 @@
 
 # - support/vscode/koka.language-koka/package.json
 # - whatsnew.md, readme.md
+# Also update unicode asian-width list in `std/text/unicode`
+# using the output of `stack exec koka -- -e util/update-unicode -- -a`
 
 name: koka
 version: 3.0.5
diff --git a/test/lib/unicode.kk b/test/lib/unicode.kk
new file mode 100644
index 000000000..9d7466642
--- /dev/null
+++ b/test/lib/unicode.kk
@@ -0,0 +1,11 @@
+// https://github.com/koka-lang/koka/issues/457
+// https://github.com/koka-lang/koka/issues/458
+import std/text/unicode
+
+fun main()
+  // heart, variation, zero width join, fire
+  // ['h','i','/u2764','/uFE0F','/u200D','/U01F525']
+  "hi❤️‍🔥".list.println
+  "hi❤️‍🔥".graphemes.length.println
+
+ 
println(width("👾")) diff --git a/test/lib/unicode.kk.out b/test/lib/unicode.kk.out new file mode 100644 index 000000000..5b4849494 --- /dev/null +++ b/test/lib/unicode.kk.out @@ -0,0 +1,3 @@ +['h','i','/u2764','/uFE0F','/u200D','/U01F525'] +3 +2 \ No newline at end of file diff --git a/util/update-unicode.kk b/util/update-unicode.kk new file mode 100644 index 000000000..240a09f6d --- /dev/null +++ b/util/update-unicode.kk @@ -0,0 +1,45 @@ +import std/os/path +import std/os/dir +import std/os/file +import std/os/process +import std/os/env +import std/os/flags + +struct iflags + asian-wide: bool = False + +val header = "usage:\n stack exec koka -- -e util/update-unicode [-- [options]]\n\noptions:" + +val flag-descs = + fun set-asian-wide( f : iflags, b : bool ) : iflags { f(asian-wide = b) } + [ Flag( "a", ["asian-wide"], Bool(set-asian-wide), "print updated asian wide information" )] + +pub fun process-flags() : maybe + val (flags,args,errs) = parse( Iflags(), flag-descs, get-args() ) + if errs.is-nil && args.is-nil then Just(flags) else + println( errs.join("\n") ++ "\n" ++ flag-descs.usage(header) ) + Nothing + +fun main() + val flags = process-flags().unjust + if flags.asian-wide then + val file = run-system(r#"curl "https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt" > EastAsianWidth.txt"#) + if "EastAsianWidth.txt".path.is-file then + val lines = "EastAsianWidth.txt".path.read-text-file().split("\n") + with line <- lines.foreach() + val values = line.split(";") + if length(values) >= 2 then + val width = values[1].unjust.split("#")[0].unjust.trim-right().trim-left() + if width == "W" then + val charrange = values[0].unjust.trim-right().split("..") + if length(charrange) == 2 then + println(" single(0x" ++ charrange[0].unjust ++ ",0x" ++ charrange[1].unjust ++ "),") + elif length(charrange) == 1 then + println(" single(0x" ++ charrange[0].unjust ++ ",0x" ++ charrange[0].unjust ++ "),") + else + throw("Error unsupported range " ++ charrange.show) + // else 
+ // println("Unrecognized format " ++ line) + else + println("Could not find EastAsianWidth.txt\n\tPlease download from https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt to your current directory") + \ No newline at end of file