Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TIKA-2421: About the encoding of HTML #338

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,9 @@ private static void addAll() {
addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250");
addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251");
addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1",
"ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987",
"l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");
"ibm819", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");
addCharset(charset("ISO-8859-1"), "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591",
"iso_8859-1", "iso_8859-1:1987");
addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253");
addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148",
"iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,7 @@ public void replacement() throws IOException {

@Test
public void iso88591() throws IOException {
// In the spec, iso-8859-1 is an alias for WINDOWS-1252
assertWindows1252("<meta charset='iso-8859-1'>");
assertCharset("<meta charset='iso-8859-1'>", StandardCharsets.ISO_8859_1);
}

@Test
Expand Down Expand Up @@ -294,10 +293,9 @@ public void withCompactComment() throws IOException {
@Test
public void withCharsetInContentType() throws IOException {
metadata.set(Metadata.CONTENT_TYPE, "text/html; Charset=ISO-8859-1");
// ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
assertWindows1252("");
assertWindows1252("<meta charset='UTF-8'>");
assertWindows1252("<meta http-equiv='content-type' content='charset=utf-8'>");
assertCharset("", StandardCharsets.ISO_8859_1);
assertCharset("<meta charset='UTF-8'>", StandardCharsets.ISO_8859_1);
assertCharset("<meta http-equiv='content-type' content='charset=utf-8'>", StandardCharsets.ISO_8859_1);
// if a BOM is present, it has precedence over transport layer information
assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
import org.apache.tika.parser.txt.Icu4jEncodingDetector;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.parser.txt.UniversalEncodingDetector;
Expand All @@ -56,7 +57,7 @@ public void testDefault() {
assertTrue(detector instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
assertEquals(3, detectors.size());
assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
assertTrue(detectors.get(0) instanceof StandardHtmlEncodingDetector);
assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
<!-- Explicitly request default parsers -->
<parsers/>
<encodingDetectors>
<!-- All detectors except HtmlEncodingDetector -->
<!-- All detectors except StandardHtmlEncodingDetector -->
<encodingDetector class="org.apache.tika.detect.DefaultEncodingDetector">
<encodingDetector-exclude class="org.apache.tika.parser.html.HtmlEncodingDetector"/>
<encodingDetector-exclude class="org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector"/>
</encodingDetector>
<!-- One other detector, to check ordering -->
<encodingDetector class="org.apache.tika.detect.NonDetectingEncodingDetector"/>
Expand Down