From abba180841012cfbfb861940ee4ca793d8c25362 Mon Sep 17 00:00:00 2001 From: Martijn Dekker Date: Fri, 22 Mar 2024 02:36:49 +0000 Subject: [PATCH] Add iswpunct(3) breakage workaround for Android MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Android 14.0 with Termux 0.118.0, one regression test failure remains, in src/cmd/ksh93/tests/sh_match.sh: the '$' character does not match the [[:punct:]] character class, which is incorrect. This character class queries the system's iswpunct(3) function via src/lib/libast/regex/regclass.c, even for plain single-byte ASCII characters; it should be equivalent to ispunct(3) in that case. Turns out iswpunct(3) is simply broken on Android. Nine of the expected ASCII characters are not recognised, whereas ispunct(3) is fine. A test program shows: ispunct('!')==16; iswpunct('!')==1 ispunct('"')==16; iswpunct('"')==1 ispunct('#')==16; iswpunct('#')==1 ispunct('$')==16; iswpunct('$')==0 ispunct('%')==16; iswpunct('%')==1 ispunct('&')==16; iswpunct('&')==1 ispunct(''')==16; iswpunct(''')==1 ispunct('(')==16; iswpunct('(')==1 ispunct(')')==16; iswpunct(')')==1 ispunct('*')==16; iswpunct('*')==1 ispunct('+')==16; iswpunct('+')==0 ispunct(',')==16; iswpunct(',')==1 ispunct('-')==16; iswpunct('-')==1 ispunct('.')==16; iswpunct('.')==1 ispunct('/')==16; iswpunct('/')==1 ispunct(':')==16; iswpunct(':')==1 ispunct(';')==16; iswpunct(';')==1 ispunct('<')==16; iswpunct('<')==0 ispunct('=')==16; iswpunct('=')==0 ispunct('>')==16; iswpunct('>')==0 ispunct('?')==16; iswpunct('?')==1 ispunct('@')==16; iswpunct('@')==1 ispunct('[')==16; iswpunct('[')==1 ispunct('\')==16; iswpunct('\')==1 ispunct(']')==16; iswpunct(']')==1 ispunct('^')==16; iswpunct('^')==0 ispunct('_')==16; iswpunct('_')==1 ispunct('`')==16; iswpunct('`')==0 ispunct('{')==16; iswpunct('{')==1 ispunct('|')==16; iswpunct('|')==0 ispunct('}')==16; iswpunct('}')==1 ispunct('~')==16; iswpunct('~')==0 It's broken for multibyte UTF-8 characters 
as well; at least £ and € are not recognised as punctuation. But there's nothing we can realistically do about that. But we can at least fix the ASCII characters as those are important for portability. --- src/cmd/ksh93/tests/sh_match.sh | 5 ++++- src/lib/libast/features/wchar | 16 ++++++++++++++++ src/lib/libast/regex/regclass.c | 11 +++++++++-- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/cmd/ksh93/tests/sh_match.sh b/src/cmd/ksh93/tests/sh_match.sh index 2ca2be8438a2..a66beaff6637 100755 --- a/src/cmd/ksh93/tests/sh_match.sh +++ b/src/cmd/ksh93/tests/sh_match.sh @@ -78,7 +78,10 @@ [[ $'\cg' =~ [[:cntrl:]] ]] || err_exit 'pattern [[:cntrl:]] broken' [[ \$ =~ [[:graph:]] ]] || err_exit 'pattern [[:graph:]] broken' [[ ' ' =~ [[:graph:]] ]] && err_exit 'pattern [[:graph:]] broken' -[[ \$ =~ [[:punct:]] ]] || err_exit 'pattern [[:punct:]] broken' +for c in '!' '"' '#' '$' '%' '&' \' '(' ')' '*' '+' ',' '-' '.' '/' ':' ';' \ + '<' '=' '>' '?' '@' '[' '\\' ']' '^' '_' '`' '{' '|' '}' '~' +do [[ $c =~ [[:punct:]] ]] || err_exit "pattern [[:punct:]] broken for $c" +done [[ / =~ [[:punct:]] ]] || err_exit 'pattern [[:punct:]] broken' [[ ' ' =~ [[:punct:]] ]] && err_exit 'pattern [[:punct:]] broken' [[ x =~ [[:punct:]] ]] && err_exit 'pattern [[:punct:]] broken' diff --git a/src/lib/libast/features/wchar b/src/lib/libast/features/wchar index 3cd8f83f48c0..c3c55eda0944 100644 --- a/src/lib/libast/features/wchar +++ b/src/lib/libast/features/wchar @@ -56,6 +56,22 @@ if hdr - wctype wchar.h endif endif +tst iswpunct_broken note{ is iswpunct(3) broken }end execute{ + /* on Android 14.0, it is: it does not recognise some of the ASCII characters. 
ispunct(3) is fine */ + #include <ctype.h> + #include <wctype.h> + int main(void) + { + char c[] = { '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', + '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '\0' }; + int i; + for (i = 0; c[i]; i++) + if (ispunct(c[i]) && !iswpunct(c[i])) + return 0; + return 1; + } +}end + run{ cat <