Add automatic checking for profanity

This adds functionality to automatically check for profanity in text messages written in any of the XMPP MUC rooms monitored by the moderation bot. The terms considered profanity are language-specific and can be configured in the database; they have to be stored in their lemmatized form. If a supported language is detected with an accuracy of 100%, only terms for that language are checked; otherwise, English terms are checked as well. The languages supported for now are English, French, German, Polish, Portuguese, Russian, Spanish and Turkish. The first two times a user uses profanity within a sliding window of three months, they receive a warning. Starting with the third time, the user gets muted. The first mute lasts five minutes, and the duration increases exponentially, up to a maximum of one week, with each further use of profanity. To enable this functionality, the `--enable-profanity-monitoring` command line option has to be provided.
0ad · Oct 8, 2024 · 1ed99e3 · 1ed99e3
1 parent 64d6cf2
commit 1ed99e3
Show file tree

Hide file tree

Showing 3 changed files with 273 additions and 27 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "cachetools",
     "defusedxml",
     "dateparser",
+    "simplemma[marisa-trie]>=1.1.1",
     "slixmpp>=1.8.0",
     "sqlalchemy>=2.0.4",
 ]
@@ -87,5 +88,5 @@ max-doc-length = 72
 convention = "pep257"
 
 [tool.ruff.lint.pylint]
-max-args = 8
+max-args = 10
 max-nested-blocks = 4
diff --git a/xpartamupp/lobby_moderation_db.py b/xpartamupp/lobby_moderation_db.py
@@ -24,6 +24,7 @@
 from typing import Any, ClassVar
 
 from sqlalchemy import (
+    JSON,
     DateTime,
     ForeignKey,
     String,
@@ -69,20 +70,13 @@ class Base(DeclarativeBase):
     }
 
 
-class Blacklist(Base):
+class ProfanityTerms(Base):
     """Model for profanity terms."""
 
-    __tablename__ = "profanity_blacklist"
+    __tablename__ = "profanity_terms"
 
-    word: Mapped[str] = mapped_column(String(255), primary_key=True)
-
-
-class Whitelist(Base):
-    """Model for terms which are whitelisted from profanity."""
-
-    __tablename__ = "profanity_whitelist"
-
-    word: Mapped[str] = mapped_column(String(255), primary_key=True)
+    term: Mapped[str] = mapped_column(String(255), primary_key=True)
+    language: Mapped[str] = mapped_column(String(2), primary_key=True)
 
 
 class ProfanityIncident(Base):
@@ -91,10 +85,12 @@ class ProfanityIncident(Base):
     __tablename__ = "profanity_incidents"
 
     id: Mapped[int] = mapped_column(primary_key=True)
-    timestamp: Mapped[datetime]
+    timestamp: Mapped[datetime] = mapped_column(default=partial(datetime.now, tz=UTC))
     player: Mapped[str] = mapped_column(String(255))
+    room: Mapped[str] = mapped_column(String(255))
     offending_content: Mapped[str] = mapped_column(UnicodeText)
-    deleted: Mapped[bool]
+    detected_languages: Mapped[list[str]] = mapped_column(JSON)
+    matched_terms: Mapped[list[str]] = mapped_column(JSON)
 
 
 class JIDNickWhitelist(Base):