diff --git a/tika-parsers/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/tika-parsers/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector new file mode 100644 index 0000000000..278536c063 --- /dev/null +++ b/tika-parsers/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector @@ -0,0 +1 @@ +org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector \ No newline at end of file diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java index 4d4c7c2b49..2fe6b1ad73 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java @@ -98,8 +98,9 @@ private static void addAll() { addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250"); addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251"); addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1", - "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987", - "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252"); + "ibm819", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252"); + addCharset(charset("ISO-8859-1"), "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", + "iso_8859-1", "iso_8859-1:1987"); addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253"); addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254"); diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java index 38d351f7f5..d87b0439bb 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java @@ -177,8 +177,7 @@ public void replacement() throws IOException { @Test public void iso88591() throws IOException { - // In the spec, iso-8859-1 is an alias for WINDOWS-1252 - assertWindows1252(""); + assertCharset("", StandardCharsets.ISO_8859_1); } @Test @@ -294,10 +293,9 @@ public void withCompactComment() throws IOException { @Test public void withCharsetInContentType() throws IOException { metadata.set(Metadata.CONTENT_TYPE, "text/html; Charset=ISO-8859-1"); - // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level - assertWindows1252(""); - assertWindows1252(""); - assertWindows1252(""); + assertCharset("", StandardCharsets.ISO_8859_1); + assertCharset("", StandardCharsets.ISO_8859_1); + assertCharset("", StandardCharsets.ISO_8859_1); // if a BOM is present, it has precedence over transport layer information assertCharset("\ufeff", StandardCharsets.UTF_8); assertCharset("\ufeff", StandardCharsets.UTF_16LE); diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index 4d5c66b4e9..883fd3e4f4 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -43,6 +43,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.html.HtmlEncodingDetector; +import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector; import org.apache.tika.parser.txt.Icu4jEncodingDetector; import org.apache.tika.parser.txt.TXTParser; import org.apache.tika.parser.txt.UniversalEncodingDetector; @@ -56,7 +57,7 @@ public void testDefault() { assertTrue(detector instanceof CompositeEncodingDetector); List detectors = ((CompositeEncodingDetector) detector).getDetectors(); assertEquals(3, detectors.size()); - assertTrue(detectors.get(0) instanceof HtmlEncodingDetector); + assertTrue(detectors.get(0) instanceof StandardHtmlEncodingDetector); assertTrue(detectors.get(1) instanceof UniversalEncodingDetector); assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector); } diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml index 8f4de2258f..5167c2dc01 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml @@ -19,9 +19,9 @@ - + - +