From 6a9f4ca6930bc4160030c9e21885af476eeeded6 Mon Sep 17 00:00:00 2001 From: PeterAlfredLee Date: Tue, 25 Aug 2020 16:50:37 +0800 Subject: [PATCH] Modify default encoding detector Replace HtmlEncodingDetector with StandardHtmlEncodingDetector Adjust some test cases --- .../META-INF/services/org.apache.tika.detect.EncodingDetector | 1 + .../java/org/apache/tika/config/TikaEncodingDetectorTest.java | 3 ++- .../config/TIKA-2273-exclude-encoding-detector-default.xml | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 tika-parsers/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector diff --git a/tika-parsers/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/tika-parsers/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector new file mode 100644 index 00000000000..278536c0633 --- /dev/null +++ b/tika-parsers/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector @@ -0,0 +1 @@ +org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector \ No newline at end of file diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index 4d5c66b4e96..883fd3e4f42 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -43,6 +43,7 @@ import org.apache.tika.parser.Parser; import 
org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.html.HtmlEncodingDetector; +import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector; import org.apache.tika.parser.txt.Icu4jEncodingDetector; import org.apache.tika.parser.txt.TXTParser; import org.apache.tika.parser.txt.UniversalEncodingDetector; @@ -56,7 +57,7 @@ public void testDefault() { assertTrue(detector instanceof CompositeEncodingDetector); List detectors = ((CompositeEncodingDetector) detector).getDetectors(); assertEquals(3, detectors.size()); - assertTrue(detectors.get(0) instanceof HtmlEncodingDetector); + assertTrue(detectors.get(0) instanceof StandardHtmlEncodingDetector); assertTrue(detectors.get(1) instanceof UniversalEncodingDetector); assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector); } diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-integration-tests/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-integration-tests/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml index 8f4de2258fd..5167c2dc01d 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-integration-tests/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-integration-tests/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml @@ -19,9 +19,9 @@ - + - +