diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index e90fa21fb2..29a4415d77 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -379,7 +379,7 @@ 4.13.1 2.0.28 - 5.2.3 + 5.2.4-SNAPSHOT 2.3.2 1.1.8 1.19.0 diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java index 1cc47ba13a..4acdd0be2a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java @@ -16,6 +16,7 @@ */ package org.apache.tika.detect.microsoft; +import static org.apache.poi.hssf.model.InternalWorkbook.BOOK; import static org.apache.tika.mime.MediaType.application; import static org.apache.tika.mime.MediaType.image; @@ -286,16 +287,11 @@ public static MediaType detect(Set anyCaseNames, DirectoryEntry root) { if (mediaType != null) { return mediaType; } - - for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) { - if (ucNames.contains(workbookEntryName)) { - MediaType tmp = processCompObjFormatType(root); - if (tmp.equals(MS_GRAPH_CHART)) { - return MS_GRAPH_CHART; - } - return XLS; - } + mediaType = checkXLS(ucNames, root); + if (mediaType != null) { + return mediaType; } + if (ucNames.contains(SW_DOC_CONTENT_MGR) && ucNames.contains(SW_DOC_MGR_TEMP_STORAGE)) { return SLDWORKS; } else if (ucNames.contains(STAR_CALC_DOCUMENT)) { @@ -322,9 +318,6 @@ public static MediaType detect(Set anyCaseNames, DirectoryEntry root) { // Works 7.0 spreadsheet files contain both // we want to avoid 
classifying this as Excel return XLR; - } else if (ucNames.contains("BOOK")) { - // Excel 95 or older, we won't be able to parse this.... - return XLS; } else if (ucNames.contains(WORD_DOCUMENT)) { return DOC; } else if (ucNames.contains(QUILL)) { @@ -395,6 +388,26 @@ public static MediaType detect(Set anyCaseNames, DirectoryEntry root) { return OLE; } + /** + * Checks the entry names for an Excel workbook stream. + * NOTE(review): the removed check used the literal "BOOK"; confirm BOOK has that casing. + * @param ucNames entry names to search + * @param root directory entry forwarded to processCompObjFormatType + * @return MS_GRAPH_CHART or XLS when a workbook entry is present, otherwise null + */ + private static MediaType checkXLS(Set ucNames, DirectoryEntry root) { + boolean hasWorkbook = ucNames.contains(BOOK); + for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) { + hasWorkbook = hasWorkbook || ucNames.contains(workbookEntryName); + } + if (!hasWorkbook) { + return null; + } + // constant-first equals cannot throw if no format type comes back + MediaType tmp = processCompObjFormatType(root); + return MS_GRAPH_CHART.equals(tmp) ? MS_GRAPH_CHART : XLS; + } + private static MediaType checkEncrypted(Set ucNames, DirectoryEntry root) { //figure out if encrypted/pw protected first if (ucNames.contains(DATA_SPACES)) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java index 7a14733e8b..81e088f914 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java @@ -159,8 +159,7 @@ protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName, // Is it an embedded OLE2 document, or an embedded OOXML document? 
//first try for ooxml - Entry ooxml = dir.hasEntry("Package") ? dir.getEntry("Package") : - (dir.hasEntry("package") ? dir.getEntry("package") : null); + Entry ooxml = dir.hasEntry("Package") ? dir.getEntry("Package") : null; if (ooxml != null) { // It's OOXML (has a ZipFile): @@ -218,16 +217,12 @@ private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type, String rN //TikaCoreProperties.ORIGINAL_RESOURCE_NAME // Grab the contents and process - DocumentEntry contentsEntry; + DocumentEntry contentsEntry = null; try { contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS"); - } catch (FileNotFoundException fnfe1) { - try { - contentsEntry = (DocumentEntry) dir.getEntry("Contents"); - } catch (FileNotFoundException fnfe2) { - EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe2, parentMetadata); - return; - } + } catch (FileNotFoundException fnfe) { + EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe, parentMetadata); + return; } int length = contentsEntry.getSize(); DocumentInputStream inp = null; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java index 871e37a911..f8e5bf7499 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java @@ -182,7 +182,9 @@ private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, try { contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { - contentsEntry = (DocumentEntry) 
root.getEntry("Contents"); + //no contents + EmbeddedDocumentUtil.recordEmbeddedStreamException(ioe, metadata); + return ret; } try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java index 7f38caf76c..ad7f16ed12 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OLE2CasingTest.java @@ -23,7 +23,6 @@ import java.util.Set; import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; @@ -50,7 +49,6 @@ public void testEncrypted() throws Exception { } @Test - @Disabled("until POI can handle case insensitive entry lookups") public void testBasic() throws Exception { List metadataList = getRecursiveMetadata("casing/simple_normal_case.doc"); assertCloseEnough(metadataList, getRecursiveMetadata("casing/simple_lower_case.doc"));