diff --git a/tika-parsers/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/tika-parsers/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
new file mode 100644
index 0000000000..278536c063
--- /dev/null
+++ b/tika-parsers/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -0,0 +1 @@
+org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
index 4d4c7c2b49..2fe6b1ad73 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
@@ -98,8 +98,9 @@ private static void addAll() {
addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250");
addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251");
addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1",
- "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987",
- "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");
+ "ibm819", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");
+ addCharset(charset("ISO-8859-1"), "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591",
+ "iso_8859-1", "iso_8859-1:1987");
addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253");
addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148",
"iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254");
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
index 38d351f7f5..d87b0439bb 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
@@ -177,8 +177,7 @@ public void replacement() throws IOException {
@Test
public void iso88591() throws IOException {
- // In the spec, iso-8859-1 is an alias for WINDOWS-1252
- assertWindows1252("");
+ assertCharset("", StandardCharsets.ISO_8859_1);
}
@Test
@@ -294,10 +293,9 @@ public void withCompactComment() throws IOException {
@Test
public void withCharsetInContentType() throws IOException {
metadata.set(Metadata.CONTENT_TYPE, "text/html; Charset=ISO-8859-1");
- // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
- assertWindows1252("");
- assertWindows1252("");
- assertWindows1252("");
+ assertCharset("", StandardCharsets.ISO_8859_1);
+ assertCharset("", StandardCharsets.ISO_8859_1);
+ assertCharset("", StandardCharsets.ISO_8859_1);
// if a BOM is present, it has precedence over transport layer information
assertCharset("\ufeff", StandardCharsets.UTF_8);
assertCharset("\ufeff", StandardCharsets.UTF_16LE);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index 4d5c66b4e9..883fd3e4f4 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -43,6 +43,7 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.html.HtmlEncodingDetector;
+import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
import org.apache.tika.parser.txt.Icu4jEncodingDetector;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.parser.txt.UniversalEncodingDetector;
@@ -56,7 +57,7 @@ public void testDefault() {
assertTrue(detector instanceof CompositeEncodingDetector);
List detectors = ((CompositeEncodingDetector) detector).getDetectors();
assertEquals(3, detectors.size());
- assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
+ assertTrue(detectors.get(0) instanceof StandardHtmlEncodingDetector);
assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml
index 8f4de2258f..5167c2dc01 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml
@@ -19,9 +19,9 @@
-
+
-
+