Skip to content

Commit

Permalink
remove offensive chars during soft cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
ornicar committed Dec 13, 2024
1 parent a13c054 commit 0636f72
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
12 changes: 8 additions & 4 deletions lila/src/main/scala/StringOps.scala
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@ object StringOps:
/** Drops every UTF-16 code unit of `str` that `isRemoveable` accepts.
  *
  * The string is scanned first; when no unit matches, `str` itself is returned
  * and no filtered copy is built.
  */
private def removeChars(str: String, isRemoveable: Int => Boolean): String =
  val hasRemoveable = str.chars.anyMatch(unit => isRemoveable(unit))
  if hasRemoveable then str.filterNot(ch => isRemoveable(ch))
  else str

private def isGarbageChar(c: Int) = c >= '\u0250' && (isInvisibleChar(c) ||
private def isGarbageChar(c: Int) = c >= '\u0250' && (isOffensiveChar(c) || isInvisibleChar(c) ||
// bunch of probably useless blocks https://www.compart.com/en/unicode/block/U+2100
// but keep maths operators cause maths are cool https://www.compart.com/en/unicode/block/U+2200
// and chess symbols https://www.compart.com/en/unicode/block/U+2600
(c >= '\u2100' && c <= '\u21FF') ||
(c >= '\u2300' && c <= '\u2653') ||
(c >= '\u2660' && c <= '\u2C5F') ||
// decorative chars ꧁ ꧂ and svastikas
(c == '\ua9c1' || c == '\ua9c2' || c == '\u534d' || c == '\u5350') ||
// decorative chars ꧁ ꧂
(c == '\ua9c1' || c == '\ua9c2') ||
// pretty quranic chars ۩۞
(c >= '\u06d6' && c <= '\u06ff') ||
// phonetic extensions https://www.compart.com/en/unicode/block/U+1D00
Expand All @@ -56,6 +56,10 @@ object StringOps:
// but allow https://www.compart.com/en/unicode/U+0259
(c >= '\u0250' && c < '\u0259') || (c > '\u0259' && c <= '\u02af'))

// True only for the two swastika code points, U+534D and U+5350.
private def isOffensiveChar(c: Int) =
  c match
    case 0x534d | 0x5350 => true
    case _               => false

// Membership test against `invisibleChars`; `c` is first narrowed to a single UTF-16 code unit.
private inline def isInvisibleChar(c: Int) =
  val codeUnit = c.toChar
  invisibleChars(codeUnit)

private val invisibleChars: Set[Int] =
Expand Down Expand Up @@ -130,7 +134,7 @@ object StringOps:

// For inner text like study chapter names, possibly forum posts and team descriptions.
// Pipeline: normalize -> removeChars -> removeMultibyteInvisible -> trim.
// NOTE(review): this diff view lists two bodies. Judging by the commit title, the first is
// presumably the previous one (strips invisible chars only) and the second its replacement,
// which also strips chars matched by isOffensiveChar. Confirm against the actual file.
def softCleanUp(str: String) =
removeMultibyteInvisible(removeChars(normalize(str), isInvisibleChar(_))).trim
removeMultibyteInvisible(removeChars(normalize(str), c => isOffensiveChar(c) || isInvisibleChar(c))).trim

object base64:
import java.util.Base64
Expand Down
4 changes: 3 additions & 1 deletion lila/src/test/scala/StringOpsTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class StringTest extends munit.FunSuite:
assertEquals(rgc("""ℱ۩۞۩꧁꧂"""), "")
assertEquals(rgc("""ᴀᴛᴏᴍɪᴄ"""), "")
assertEquals(rgc("""af éâòöÌÒÒçÇℱ۩۞۩꧁꧂" صار"""), """af éâòöÌÒÒçÇ" صار""")
assertEquals(rgc("卐卐卐"), "")
i18nValidStrings.foreach: txt =>
assertEquals(rgc(txt), txt)

Expand All @@ -59,8 +60,9 @@ class StringTest extends munit.FunSuite:
test("normalize preserve half point"):
assertEquals(normalize("½"), "½")

test("invisible chars"):
test("soft cleanup"):
val sc = softCleanUp
assertEquals(sc("卐卐卐"), "")
// normal space
assertEquals(sc(" "), "")
assertEquals(sc(" "), "")
Expand Down

0 comments on commit 0636f72

Please sign in to comment.