diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index e90fa21fb2..29a4415d77 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -379,7 +379,7 @@
4.13.1
2.0.28
- 5.2.3
+ 5.2.4-SNAPSHOT
2.3.2
1.1.8
1.19.0
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index 1cc47ba13a..4acdd0be2a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.detect.microsoft;
+import static org.apache.poi.hssf.model.InternalWorkbook.BOOK;
import static org.apache.tika.mime.MediaType.application;
import static org.apache.tika.mime.MediaType.image;
@@ -286,16 +287,11 @@ public static MediaType detect(Set anyCaseNames, DirectoryEntry root) {
if (mediaType != null) {
return mediaType;
}
-
- for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
- if (ucNames.contains(workbookEntryName)) {
- MediaType tmp = processCompObjFormatType(root);
- if (tmp.equals(MS_GRAPH_CHART)) {
- return MS_GRAPH_CHART;
- }
- return XLS;
- }
+ mediaType = checkXLS(ucNames, root);
+ if (mediaType != null) {
+ return mediaType;
}
+
if (ucNames.contains(SW_DOC_CONTENT_MGR) && ucNames.contains(SW_DOC_MGR_TEMP_STORAGE)) {
return SLDWORKS;
} else if (ucNames.contains(STAR_CALC_DOCUMENT)) {
@@ -322,9 +318,6 @@ public static MediaType detect(Set anyCaseNames, DirectoryEntry root) {
// Works 7.0 spreadsheet files contain both
// we want to avoid classifying this as Excel
return XLR;
- } else if (ucNames.contains("BOOK")) {
- // Excel 95 or older, we won't be able to parse this....
- return XLS;
} else if (ucNames.contains(WORD_DOCUMENT)) {
return DOC;
} else if (ucNames.contains(QUILL)) {
@@ -395,6 +388,26 @@ public static MediaType detect(Set anyCaseNames, DirectoryEntry root) {
return OLE;
}
+ /**
+  * Looks for an Excel workbook stream among the root's entry names: any of POI's
+  * workbook entry names, or {@code BOOK} (presumably the Excel 95-and-older name).
+  *
+  * @return MS_GRAPH_CHART when processCompObjFormatType(root) reports it, XLS for
+  *         any other workbook, or null when none of those entry names is present
+  */
+ private static MediaType checkXLS(Set ucNames, DirectoryEntry root) {
+     boolean hasWorkbookStream = ucNames.contains(BOOK);
+     for (String candidate : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
+         hasWorkbookStream |= ucNames.contains(candidate);
+     }
+     if (!hasWorkbookStream) {
+         return null;
+     }
+     // Both name families are classified the same way, so resolve the type once.
+     MediaType compObjType = processCompObjFormatType(root);
+     return compObjType.equals(MS_GRAPH_CHART) ? MS_GRAPH_CHART : XLS;
+ }
+
private static MediaType checkEncrypted(Set ucNames, DirectoryEntry root) {
//figure out if encrypted/pw protected first
if (ucNames.contains(DATA_SPACES)) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 7a14733e8b..81e088f914 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -159,8 +159,7 @@ protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName,
// Is it an embedded OLE2 document, or an embedded OOXML document?
//first try for ooxml
- Entry ooxml = dir.hasEntry("Package") ? dir.getEntry("Package") :
- (dir.hasEntry("package") ? dir.getEntry("package") : null);
+ Entry ooxml = dir.hasEntry("Package") ? dir.getEntry("Package") : null;
if (ooxml != null) {
// It's OOXML (has a ZipFile):
@@ -218,16 +217,12 @@ private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type, String rN
//TikaCoreProperties.ORIGINAL_RESOURCE_NAME
// Grab the contents and process
- DocumentEntry contentsEntry;
+ DocumentEntry contentsEntry = null;
try {
contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
- } catch (FileNotFoundException fnfe1) {
- try {
- contentsEntry = (DocumentEntry) dir.getEntry("Contents");
- } catch (FileNotFoundException fnfe2) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe2, parentMetadata);
- return;
- }
+ } catch (FileNotFoundException fnfe) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe, parentMetadata);
+ return;
}
int length = contentsEntry.getSize();
DocumentInputStream inp = null;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
index 871e37a911..f8e5bf7499 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
@@ -182,7 +182,9 @@ private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata,
try {
contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
} catch (FileNotFoundException ioe) {
- contentsEntry = (DocumentEntry) root.getEntry("Contents");
+ //no contents
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(ioe, metadata);
+ return ret;
}
try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java
index 7f38caf76c..ad7f16ed12 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java
@@ -23,7 +23,6 @@
import java.util.Set;
import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
@@ -50,7 +49,6 @@ public void testEncrypted() throws Exception {
}
@Test
- @Disabled("until POI can handle case insensitive entry lookups")
public void testBasic() throws Exception {
List metadataList = getRecursiveMetadata("casing/simple_normal_case.doc");
assertCloseEnough(metadataList, getRecursiveMetadata("casing/simple_lower_case.doc"));