From 2192f91d1c8c7c03b2a44f18cc7c709baba90a59 Mon Sep 17 00:00:00 2001 From: PeterAlfredLee Date: Tue, 25 Aug 2020 16:48:47 +0800 Subject: [PATCH] Modify Charset Aliases: Stop treating ISO-8859-1 as Windows-1252's alias --- .../parser/html/charsetdetector/CharsetAliases.java | 5 +++-- .../parser/html/StandardHtmlEncodingDetectorTest.java | 10 ++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tika-parser-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java b/tika-parser-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java index 4d4c7c2b49e..2fe6b1ad73b 100644 --- a/tika-parser-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java +++ b/tika-parser-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java @@ -98,8 +98,9 @@ private static void addAll() { addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250"); addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251"); addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1", - "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987", - "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252"); + "ibm819", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252"); + addCharset(charset("ISO-8859-1"), "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", + "iso_8859-1", "iso_8859-1:1987"); addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253"); addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254"); diff --git 
a/tika-parser-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java b/tika-parser-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java index 38d351f7f58..d87b0439bbc 100644 --- a/tika-parser-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java +++ b/tika-parser-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java @@ -177,8 +177,7 @@ public void replacement() throws IOException { @Test public void iso88591() throws IOException { - // In the spec, iso-8859-1 is an alias for WINDOWS-1252 - assertWindows1252(""); + assertCharset("", StandardCharsets.ISO_8859_1); } @Test @@ -294,10 +293,9 @@ public void withCompactComment() throws IOException { @Test public void withCharsetInContentType() throws IOException { metadata.set(Metadata.CONTENT_TYPE, "text/html; Charset=ISO-8859-1"); - // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level - assertWindows1252(""); - assertWindows1252(""); - assertWindows1252(""); + assertCharset("", StandardCharsets.ISO_8859_1); + assertCharset("", StandardCharsets.ISO_8859_1); + assertCharset("", StandardCharsets.ISO_8859_1); // if a BOM is present, it has precedence over transport layer information assertCharset("\ufeff", StandardCharsets.UTF_8); assertCharset("\ufeff", StandardCharsets.UTF_16LE);