Skip to content

Commit

Permalink
Force UTF-8 parsing of HTML via lxml
Browse files Browse the repository at this point in the history
Previously, emoji would break HTML parsing
  • Loading branch information
stefanw committed Aug 8, 2023
1 parent 5b4d723 commit 02c3847
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions froide/helper/text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

try:
from lxml import html as html_parser
from lxml.html import HtmlElement
from lxml.html import HtmlElement, HTMLParser
except ImportError:
html_parser = None

Expand Down Expand Up @@ -289,7 +289,8 @@ def convert_html_to_text(html_str: str, ignore_tags: None = None) -> str:
if html_parser is None:
return strip_tags(html_str)

root = html_parser.fromstring(html_str)
parser = HTMLParser(encoding="utf-8")
root = html_parser.fromstring(html_str.encode("utf-8"), parser=parser)
try:
body = root.xpath("./body")[0]
except IndexError:
Expand Down

0 comments on commit 02c3847

Please sign in to comment.