diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Excel/Excel_Workbook.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Excel/Excel_Workbook.enso index a41dc5f58c0e..469ca8961f6b 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Excel/Excel_Workbook.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Excel/Excel_Workbook.enso @@ -49,7 +49,7 @@ type Excel_Workbook - file: The file to load. - xls_format: Whether to use the old XLS format (default is XLSX). new : File | Temporary_File -> Boolean -> Excel_Workbook - new file:(File | Temporary_File) xls_format=False = + new file:(File | Temporary_File) xls_format:Boolean=False = file_for_errors = if file.is_a Temporary_File then Nothing else file continuation raw_file = @@ -73,7 +73,7 @@ type Excel_Workbook - xls_format: Whether to use the old XLS format (default is XLSX). - file: Optional file reference. from_stream : Input_Stream -> Boolean -> File | Nothing -> Excel_Workbook - from_stream stream xls_format=False file=Nothing = Excel_Reader.handle_bad_format file <| + from_stream stream xls_format:Boolean=False file=Nothing = Excel_Reader.handle_bad_format file <| temp_file = Temporary_File.from_stream_light stream Excel_Workbook.new temp_file xls_format @@ -89,8 +89,8 @@ type Excel_Workbook ## PRIVATE ICON metadata Returns the list of databases (or catalogs) for the connection. - databases : Nothing - databases self = Nothing + databases : Vector (Text | Nothing) + databases self = [Nothing] ## PRIVATE ICON metadata @@ -109,7 +109,7 @@ type Excel_Workbook Arguments: - database: The target file to open as an Excel_Workbook. set_database : Text | File -> Excel_Workbook ! 
Illegal_Argument - set_database self database = + set_database self database:(Text | File) = if database == self.database then self else file = File.new database if file.exists && file.is_directory.not then Excel_Workbook.new file self.xls_format else @@ -163,7 +163,7 @@ type Excel_Workbook Gets the names of all the named ranges. named_ranges : Vector Text named_ranges self = self.with_java_workbook java_workbook-> - Vector.from_polyglot_array (ExcelReader.readRangeNames java_workbook) + Vector.from_polyglot_array java_workbook.getRangeNames ## PRIVATE ICON metadata diff --git a/std-bits/table/src/main/java/org/enso/table/excel/ExcelConnectionPool.java b/std-bits/table/src/main/java/org/enso/table/excel/ExcelConnectionPool.java index d790c570d5e7..92f4db31721a 100644 --- a/std-bits/table/src/main/java/org/enso/table/excel/ExcelConnectionPool.java +++ b/std-bits/table/src/main/java/org/enso/table/excel/ExcelConnectionPool.java @@ -20,6 +20,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.enso.table.excel.xssfreader.XSSFReaderWorkbook; public class ExcelConnectionPool { public static final ExcelConnectionPool INSTANCE = new ExcelConnectionPool(); @@ -64,7 +65,7 @@ public ReadOnlyExcelConnection openReadOnlyConnection(File file, ExcelFileFormat record.refCount = 1; record.file = file; record.format = format; - record.workbook = openWorkbook(file, format, false); + record.reopen(true); records.put(key, record); return new ReadOnlyExcelConnection(this, key, record); } @@ -212,10 +213,10 @@ static class ConnectionRecord { private int refCount; private File file; private ExcelFileFormat format; - private Workbook workbook; + private ExcelWorkbook workbook; private IOException initializationException = null; - T withWorkbook(Function action) throws IOException { + T withWorkbook(Function action) throws IOException { synchronized (this) { return 
action.apply(accessCurrentWorkbook()); } @@ -238,7 +239,10 @@ void reopen(boolean throwOnFailure) throws IOException { } try { - workbook = openWorkbook(file, format, false); + workbook = + format == ExcelFileFormat.XLSX + ? new XSSFReaderWorkbook(file.getAbsolutePath()) + : ExcelWorkbook.forPOIUserModel(openWorkbook(file, format, false)); } catch (IOException e) { initializationException = e; if (throwOnFailure) { @@ -248,7 +252,7 @@ void reopen(boolean throwOnFailure) throws IOException { } } - private Workbook accessCurrentWorkbook() throws IOException { + private ExcelWorkbook accessCurrentWorkbook() throws IOException { synchronized (this) { if (workbook == null) { if (initializationException != null) { @@ -278,7 +282,7 @@ private static Workbook openWorkbook(File file, ExcelFileFormat format, boolean throw e; } } - case XLSX -> { + case XLSX, XLSX_FALLBACK -> { try { PackageAccess access = writeAccess ? PackageAccess.READ_WRITE : PackageAccess.READ; OPCPackage pkg = OPCPackage.open(file, access); @@ -300,7 +304,7 @@ private static Workbook openWorkbook(File file, ExcelFileFormat format, boolean private static Workbook createEmptyWorkbook(ExcelFileFormat format) { return switch (format) { case XLS -> new HSSFWorkbook(); - case XLSX -> new XSSFWorkbook(); + case XLSX, XLSX_FALLBACK -> new XSSFWorkbook(); }; } diff --git a/std-bits/table/src/main/java/org/enso/table/excel/ExcelFileFormat.java b/std-bits/table/src/main/java/org/enso/table/excel/ExcelFileFormat.java index b5681c258e80..811d1d13f344 100644 --- a/std-bits/table/src/main/java/org/enso/table/excel/ExcelFileFormat.java +++ b/std-bits/table/src/main/java/org/enso/table/excel/ExcelFileFormat.java @@ -2,5 +2,6 @@ public enum ExcelFileFormat { XLS, - XLSX + XLSX, + XLSX_FALLBACK } diff --git a/std-bits/table/src/main/java/org/enso/table/excel/ExcelHeaders.java b/std-bits/table/src/main/java/org/enso/table/excel/ExcelHeaders.java index 056f62e54137..b7f4ca4cda62 100644 --- 
a/std-bits/table/src/main/java/org/enso/table/excel/ExcelHeaders.java +++ b/std-bits/table/src/main/java/org/enso/table/excel/ExcelHeaders.java @@ -57,7 +57,7 @@ private static String[] readRowAsHeaders( String[] output = new String[currentEndCol - startCol + 1]; for (int col = startCol; col <= currentEndCol; col++) { - String cellText = row.getFormattedCell(col); + String cellText = row.getCellText(col); String name = cellText.isEmpty() ? "" : deduplicator.makeUnique(cellText); output[col - startCol] = name; diff --git a/std-bits/table/src/main/java/org/enso/table/excel/ExcelRange.java b/std-bits/table/src/main/java/org/enso/table/excel/ExcelRange.java index 7c2f85f82d9e..552f3385fd27 100644 --- a/std-bits/table/src/main/java/org/enso/table/excel/ExcelRange.java +++ b/std-bits/table/src/main/java/org/enso/table/excel/ExcelRange.java @@ -197,7 +197,7 @@ public static ExcelRange expandSingleCell(ExcelRange excelRange, ExcelSheet shee Context context = Context.getCurrent(); while (currentRow != null && !currentRow.isEmpty(excelRange.getLeftColumn(), rightColumn)) { - rightColumn = currentRow.findEndRight(rightColumn); + rightColumn = findEndRight(currentRow, rightColumn); bottomRow++; currentRow = sheet.get(bottomRow); @@ -212,6 +212,16 @@ public static ExcelRange expandSingleCell(ExcelRange excelRange, ExcelSheet shee bottomRow - 1); } + private static int findEndRight(ExcelRow row, int start) { + Context context = Context.getCurrent(); + int column = start; + while (!row.isEmpty(column + 1)) { + column++; + context.safepoint(); + } + return column; + } + /** * @param index The index to the next character after the parsed value * @param value Parsed integer value or 0 if not valid diff --git a/std-bits/table/src/main/java/org/enso/table/excel/ExcelRow.java b/std-bits/table/src/main/java/org/enso/table/excel/ExcelRow.java index 4713213b33e0..f77c2eae533e 100644 --- a/std-bits/table/src/main/java/org/enso/table/excel/ExcelRow.java +++ 
b/std-bits/table/src/main/java/org/enso/table/excel/ExcelRow.java @@ -10,114 +10,51 @@ import org.apache.poi.ss.usermodel.ExcelNumberFormat; import org.apache.poi.ss.usermodel.FormulaError; import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; import org.graalvm.polyglot.Context; /** Wrapper class to handle Excel rows. */ -public class ExcelRow { - private static final DataFormatter formatter = new DataFormatter(); - - private final Row row; - private final int firstColumn; - private final int lastColumn; - private final boolean use1904Format; - - public ExcelRow(Row row, boolean use1904Format) { - this.row = row; - this.firstColumn = row.getFirstCellNum() + 1; - this.lastColumn = row.getLastCellNum(); - this.use1904Format = use1904Format; - } +public interface ExcelRow { + /** Gets the initial column index within the row (1-based). */ + int getFirstColumn(); - public int getFirstColumn() { - return firstColumn; - } + /** Gets the final column index within the row (1-based). */ + int getLastColumn(); - public int getLastColumn() { - return lastColumn; - } + /** Gets the cell at the given index within the row (1-based). */ + Object getCellValue(int column); - public Cell get(int column) { - return (column < firstColumn || column > lastColumn) ? null : row.getCell(column - 1); - } + /** Gets the text of a cell at the given index within the row (1-based). */ + String getCellText(int column); - public Object getCellValue(int column) { - Cell cell = get(column); - CellType cellType = getCellType(cell); - switch (cellType) { - case NUMERIC: - double dblValue = cell.getNumericCellValue(); - var nf = ExcelNumberFormat.from(cell, null); - if (nf != null && DateUtil.isADateFormat(nf.getIdx(), nf.getFormat())) { - var temporal = - use1904Format - ? 
ExcelUtils.fromExcelDateTime1904(dblValue) - : ExcelUtils.fromExcelDateTime(dblValue); - - if (temporal == null) { - return null; - } - - return switch (temporal) { - case LocalDate date -> { - var dateFormat = cell.getCellStyle().getDataFormatString(); - yield (dateFormat.contains("h") || dateFormat.contains("H")) - ? date.atStartOfDay(ZoneId.systemDefault()) - : date; - } - case ZonedDateTime zdt -> { - if (!use1904Format || zdt.getYear() != 1904 || zdt.getDayOfYear() != 1) { - yield temporal; - } - var dateFormat = cell.getCellStyle().getDataFormatString(); - yield (dateFormat.contains("y") - || dateFormat.contains("M") - || dateFormat.contains("d")) - ? zdt - : zdt.toLocalTime(); - } - default -> temporal; - }; - } else { - if (dblValue == (long) dblValue) { - return (long) dblValue; - } else { - return dblValue; - } - } - case STRING: - return cell.getStringCellValue(); - case BOOLEAN: - return cell.getBooleanCellValue(); - default: - return null; - } - } + /** Gets the cell at the given index within the row (1-based). */ + Cell get(int column); - public static CellType getCellType(Cell cell) { - if (cell == null) { - return CellType._NONE; - } + /** Checks if the specified cell is empty. */ + boolean isEmpty(int column); - CellType cellType = cell.getCellType(); - if (cellType == CellType.FORMULA) { - cellType = cell.getCachedFormulaResultType(); - } + /** Checks if the specified set of cells are empty. */ + boolean isEmpty(int start, int end); - return cellType; - } + /** Gets the cells as text. */ + String[] getCellsAsText(int startCol, int endCol); - public boolean isEmpty(int column) { - CellType cellType = getCellType(get(column)); - return (cellType == CellType._NONE) || (cellType == CellType.BLANK); + /** Creates an ExcelRow for the row at the given index (1-based) of an Apache POI Sheet, or null if the row does not exist. */ + static ExcelRow forPOIUserModel(Sheet sheet, int rowIndex, boolean use1904Format) { + var row = sheet.getRow(rowIndex - 1); + return row == null + ? 
null + : new ExcelRowFromPOIUserModel( + row, row.getFirstCellNum() + 1, row.getLastCellNum(), use1904Format); } - public boolean isEmpty(int start, int end) { + static boolean isEmptyHelper(ExcelRow row, int start, int end) { Context context = Context.getCurrent(); - int currentEnd = end == -1 ? getLastColumn() : end; - for (int column = Math.max(getFirstColumn(), start); - column <= Math.min(getLastColumn(), currentEnd); + int currentEnd = end == -1 ? row.getLastColumn() : end; + for (int column = Math.max(row.getFirstColumn(), start); + column <= Math.min(row.getLastColumn(), currentEnd); column++) { - if (!isEmpty(column)) { + if (!row.isEmpty(column)) { return false; } @@ -126,63 +63,144 @@ public boolean isEmpty(int start, int end) { return true; } - public int findEndRight(int start) { - Context context = Context.getCurrent(); - int column = start; - while (!isEmpty(column + 1)) { - column++; - context.safepoint(); + record ExcelRowFromPOIUserModel(Row row, int firstColumn, int lastColumn, boolean use1904Format) + implements ExcelRow { + private static final DataFormatter formatter = new DataFormatter(); + + public int getFirstColumn() { + return firstColumn; + } + + public int getLastColumn() { + return lastColumn; } - return column; - } - /** Returns the formatted cell value. */ - public String getFormattedCell(int col) { - var cell = get(col); - if (cell == null) { - return ""; + public Cell get(int column) { + return (column < firstColumn || column > lastColumn) ? null : row.getCell(column - 1); } - var rawCellType = cell.getCellType(); - var cellType = - rawCellType == CellType.FORMULA ? cell.getCachedFormulaResultType() : rawCellType; - - return switch (cellType) { - case ERROR -> - // Want to show the error message rather than empty. - FormulaError.forInt(cell.getErrorCellValue()).getString(); - case NUMERIC -> { - // Special handling for Number or Date cells as want to keep formatting. 
- var format = ExcelNumberFormat.from(cell, null); - var value = cell.getNumericCellValue(); - yield format == null - ? Double.toString(value) - : formatter.formatRawCellContents(value, format.getIdx(), format.getFormat()); + public Object getCellValue(int column) { + Cell cell = get(column); + CellType cellType = getCellType(cell); + switch (cellType) { + case NUMERIC: + double dblValue = cell.getNumericCellValue(); + var nf = ExcelNumberFormat.from(cell, null); + if (nf != null && DateUtil.isADateFormat(nf.getIdx(), nf.getFormat())) { + var temporal = + use1904Format + ? ExcelUtils.fromExcelDateTime1904(dblValue) + : ExcelUtils.fromExcelDateTime(dblValue); + + if (temporal == null) { + return null; + } + + return switch (temporal) { + case LocalDate date -> { + var dateFormat = cell.getCellStyle().getDataFormatString(); + yield (dateFormat.contains("h") || dateFormat.contains("H")) + ? date.atStartOfDay(ZoneId.systemDefault()) + : date; + } + case ZonedDateTime zdt -> { + if (!use1904Format || zdt.getYear() != 1904 || zdt.getDayOfYear() != 1) { + yield temporal; + } + var dateFormat = cell.getCellStyle().getDataFormatString(); + yield (dateFormat.contains("y") + || dateFormat.contains("M") + || dateFormat.contains("d")) + ? zdt + : zdt.toLocalTime(); + } + default -> temporal; + }; + } else { + if (dblValue == (long) dblValue) { + return (long) dblValue; + } else { + return dblValue; + } + } + case STRING: + return cell.getStringCellValue(); + case BOOLEAN: + return cell.getBooleanCellValue(); + default: + return null; } - default -> { - // Use the default read and then toString. - var value = getCellValue(col); - yield value == null ? "" : value.toString(); + } + + public String getCellText(int column) { + var cell = get(column); + if (cell == null) { + return ""; } - }; - } - public String[] getCellsAsText(int startCol, int endCol) { - Context context = Context.getCurrent(); - int currentEndCol = endCol == -1 ? 
getLastColumn() : endCol; - - String[] output = new String[currentEndCol - startCol + 1]; - for (int col = startCol; col <= currentEndCol; col++) { - Cell cell = get(col); - CellType type = ExcelRow.getCellType(cell); - if (type != CellType._NONE && type != CellType.BLANK && type != CellType.STRING) { - return null; + var rawCellType = cell.getCellType(); + var cellType = + rawCellType == CellType.FORMULA ? cell.getCachedFormulaResultType() : rawCellType; + + return switch (cellType) { + case ERROR -> + // Want to show the error message rather than empty. + FormulaError.forInt(cell.getErrorCellValue()).getString(); + case NUMERIC -> { + // Special handling for Number or Date cells as want to keep formatting. + var format = ExcelNumberFormat.from(cell, null); + var value = cell.getNumericCellValue(); + yield format == null + ? Double.toString(value) + : formatter.formatRawCellContents(value, format.getIdx(), format.getFormat()); + } + default -> { + // Use the default read and then toString. + var value = getCellValue(column); + yield value == null ? "" : value.toString(); + } + }; + } + + public boolean isEmpty(int column) { + CellType cellType = getCellType(get(column)); + return (cellType == CellType._NONE) || (cellType == CellType.BLANK); + } + + public boolean isEmpty(int start, int end) { + return isEmptyHelper(this, start, end); + } + + public String[] getCellsAsText(int startCol, int endCol) { + Context context = Context.getCurrent(); + int currentEndCol = endCol == -1 ? getLastColumn() : endCol; + + String[] output = new String[currentEndCol - startCol + 1]; + for (int col = startCol; col <= currentEndCol; col++) { + Cell cell = get(col); + CellType type = getCellType(cell); + if (type != CellType._NONE && type != CellType.BLANK && type != CellType.STRING) { + return null; + } + output[col - startCol] = + type == CellType.STRING && cell != null ? 
cell.getStringCellValue() : ""; + context.safepoint(); } - output[col - startCol] = - type == CellType.STRING && cell != null ? cell.getStringCellValue() : ""; - context.safepoint(); + + return output; } - return output; + private static CellType getCellType(Cell cell) { + if (cell == null) { + return CellType._NONE; + } + + CellType cellType = cell.getCellType(); + if (cellType == CellType.FORMULA) { + cellType = cell.getCachedFormulaResultType(); + } + + return cellType; + } } } diff --git a/std-bits/table/src/main/java/org/enso/table/excel/ExcelSheet.java b/std-bits/table/src/main/java/org/enso/table/excel/ExcelSheet.java index 6b2d70245e4c..4d2dd42a2a32 100644 --- a/std-bits/table/src/main/java/org/enso/table/excel/ExcelSheet.java +++ b/std-bits/table/src/main/java/org/enso/table/excel/ExcelSheet.java @@ -1,37 +1,83 @@ package org.enso.table.excel; -import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; /** Wrapper class to handle Excel sheets. */ -public class ExcelSheet { - private final Sheet sheet; - private final int firstRow; - private final int lastRow; - private final boolean use1904Format; - - public ExcelSheet(Workbook workbook, int sheetIndex) { - this.sheet = workbook.getSheetAt(sheetIndex); - this.firstRow = sheet.getFirstRowNum() + 1; - this.lastRow = sheet.getLastRowNum() + 1; - this.use1904Format = ExcelUtils.is1904DateSystem(workbook); - } +public interface ExcelSheet { + /** Gets the index of the sheet within the workbook (0-based). */ + int getSheetIndex(); - public int getLastRow() { - return lastRow; - } + /** Gets the name of the sheet. */ + String getName(); - public int getFirstRow() { - return firstRow; - } + /** Gets the initial row index within the sheet (1-based). */ + int getFirstRow(); + + /** Gets the final row index within the sheet (1-based). 
*/ + int getLastRow(); + + /** + * Gets the row at the given index within the sheet (1-based). + * + * @param row the row index (1-based). + * @return the row object or null if the row index is out of range or doesn't exist. + */ + ExcelRow get(int row); + + /** Gets the underlying Apache POI Sheet object - may be null. Provided for Writer use only. */ + Sheet getSheet(); - public ExcelRow get(int row) { - Row underlyingRow = row < firstRow || row > lastRow ? null : sheet.getRow(row - 1); - return underlyingRow == null ? null : new ExcelRow(underlyingRow, use1904Format); + /** Creates an ExcelSheet for the sheet at the given index (0-based) of an Apache POI Workbook. */ + static ExcelSheet forPOIUserModel(Workbook workbook, int sheetIndex) { + var sheet = workbook.getSheetAt(sheetIndex); + return new ExcelSheetFromPOIUserModel( + sheet, + sheetIndex, + sheet.getSheetName(), + sheet.getFirstRowNum() + 1, + sheet.getLastRowNum() + 1, + ExcelUtils.is1904DateSystem(workbook)); } - public Sheet getSheet() { - return sheet; + record ExcelSheetFromPOIUserModel( + Sheet sheet, + int sheetIndex, + String sheetName, + int firstRow, + int lastRow, + boolean use1904Format) + implements ExcelSheet { + @Override + public int getSheetIndex() { + return sheetIndex; + } + + @Override + public String getName() { + return sheetName; + } + + @Override + public int getFirstRow() { + return firstRow; + } + + @Override + public int getLastRow() { + return lastRow; + } + + @Override + public ExcelRow get(int row) { + return row < firstRow || row > lastRow + ? 
null + : ExcelRow.forPOIUserModel(sheet, row, use1904Format); + } + + @Override + public Sheet getSheet() { + return sheet; + } } } diff --git a/std-bits/table/src/main/java/org/enso/table/excel/ExcelUtils.java b/std-bits/table/src/main/java/org/enso/table/excel/ExcelUtils.java index f30bbdbde434..851fee7aeb63 100644 --- a/std-bits/table/src/main/java/org/enso/table/excel/ExcelUtils.java +++ b/std-bits/table/src/main/java/org/enso/table/excel/ExcelUtils.java @@ -1,6 +1,10 @@ package org.enso.table.excel; -import java.time.*; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneId; +import java.time.ZonedDateTime; import java.time.temporal.ChronoUnit; import java.time.temporal.Temporal; import org.apache.poi.ss.usermodel.Workbook; diff --git a/std-bits/table/src/main/java/org/enso/table/excel/ExcelWorkbook.java b/std-bits/table/src/main/java/org/enso/table/excel/ExcelWorkbook.java new file mode 100644 index 000000000000..91b86fd31f7b --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/excel/ExcelWorkbook.java @@ -0,0 +1,123 @@ +package org.enso.table.excel; + +import java.io.IOException; +import org.apache.poi.ss.usermodel.Name; + +/** Represents an Excel workbook. Wraps the underlying Apache POI Workbook object. 
*/ +public interface ExcelWorkbook { + /** + * Get the number of spreadsheets in the workbook + * + * @return the number of sheets + */ + int getNumberOfSheets(); + + /** + * Returns the index of the sheet by its name + * + * @param name the sheet name + * @return index of the sheet (0 based) + */ + int getSheetIndex(String name); + + /** + * Get the sheet name + * + * @param sheet sheet number (0 based) + * @return Sheet name + */ + String getSheetName(int sheet); + + /** + * @return the total number of defined names in this workbook + */ + int getNumberOfNames(); + + /** + * Get all the range names in the workbook + * + * @return an array of range names + */ + String[] getRangeNames(); + + /** + * Get the formula for a named range. + * + * @param name the name of the range. + * @return the formula for the range or null if not found. + */ + String getNameFormula(String name); + + /** + * Get a sheet by its index + * + * @param sheetIndex the index of the sheet (0 based) + * @return the sheet as an ExcelSheet object + * @throws IllegalArgumentException if the sheet index is out of range. + */ + ExcelSheet getSheetAt(int sheetIndex); + + /** + * Close the underlying input resource (File or Stream), from which the Workbook was read. + * + *

<p>Once this has been called, no further operations, updates or reads should be performed on + * the Workbook. + */ + void close() throws IOException; + + /** + * Create an ExcelWorkbook object from an Apache POI Workbook object + * + * @param workbook the Apache POI Workbook object + * @return the ExcelWorkbook object + */ + static ExcelWorkbook forPOIUserModel(org.apache.poi.ss.usermodel.Workbook workbook) { + return new ExcelWorkbookFromPOIUserModel(workbook); + } + + /** Wraps an Apache POI Workbook object in the ExcelWorkbook interface. */ + record ExcelWorkbookFromPOIUserModel(org.apache.poi.ss.usermodel.Workbook workbook) + implements ExcelWorkbook { + @Override + public int getNumberOfSheets() { + return workbook.getNumberOfSheets(); + } + + @Override + public int getSheetIndex(String name) { + return workbook.getSheetIndex(name); + } + + @Override + public String getSheetName(int sheet) { + return workbook.getSheetName(sheet); + } + + @Override + public int getNumberOfNames() { + return workbook.getNumberOfNames(); + } + + @Override + public String[] getRangeNames() { + var names = workbook.getAllNames(); + return names.stream().map(Name::getNameName).toArray(String[]::new); + } + + @Override + public String getNameFormula(String name) { + var namedRange = workbook.getName(name); + return namedRange == null ? 
null : namedRange.getRefersToFormula(); + } + + @Override + public ExcelSheet getSheetAt(int sheetIndex) { + return ExcelSheet.forPOIUserModel(workbook, sheetIndex); + } + + @Override + public void close() throws IOException { + workbook.close(); + } + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/excel/ReadOnlyExcelConnection.java b/std-bits/table/src/main/java/org/enso/table/excel/ReadOnlyExcelConnection.java index 60bc80e4d73b..3cbac859648a 100644 --- a/std-bits/table/src/main/java/org/enso/table/excel/ReadOnlyExcelConnection.java +++ b/std-bits/table/src/main/java/org/enso/table/excel/ReadOnlyExcelConnection.java @@ -2,7 +2,6 @@ import java.io.IOException; import java.util.function.Function; -import org.apache.poi.ss.usermodel.Workbook; public class ReadOnlyExcelConnection implements AutoCloseable { @@ -28,7 +27,7 @@ public synchronized void close() throws IOException { record = null; } - public synchronized T withWorkbook(Function f) throws IOException { + public synchronized T withWorkbook(Function f) throws IOException { if (record == null) { throw new IllegalStateException("ReadOnlyExcelConnection is being used after it was closed."); } diff --git a/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderFormats.java b/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderFormats.java new file mode 100644 index 000000000000..c634162c449f --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderFormats.java @@ -0,0 +1,29 @@ +package org.enso.table.excel.xssfreader; + +import java.util.HashMap; +import java.util.Map; +import org.apache.poi.xssf.model.StylesTable; + +/** Provides the format strings for number formats in an XSSF workbook. 
*/ +public class XSSFReaderFormats { + private final StylesTable stylesTable; + private final Map numberFormats = new HashMap<>(); + + public XSSFReaderFormats(StylesTable stylesTable) { + this.stylesTable = stylesTable; + } + + public String getNumberFormatAt(short styleIdx) { + if (numberFormats.containsKey(styleIdx)) { + return numberFormats.get(styleIdx); + } + + var style = stylesTable.getStyleAt(styleIdx); + var format = style == null ? "General" : style.getDataFormatString(); + if (format == null || format.equals("General")) { + format = ""; + } + numberFormats.put(styleIdx, format); + return format; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderRow.java b/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderRow.java new file mode 100644 index 000000000000..a35f0fcc72d3 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderRow.java @@ -0,0 +1,125 @@ +package org.enso.table.excel.xssfreader; + +import java.time.LocalDateTime; +import java.util.SortedMap; +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.enso.table.excel.ExcelRow; + +public class XSSFReaderRow implements ExcelRow { + private static final DataFormatter formatter = new DataFormatter(); + private final SortedMap data; + private final boolean use1904Dates; + + public XSSFReaderRow( + SortedMap data, boolean use1904Dates) { + this.data = data; + this.use1904Dates = use1904Dates; + } + + @Override + public int getFirstColumn() { + return data.firstKey(); + } + + @Override + public int getLastColumn() { + return data.lastKey(); + } + + @Override + public Cell get(int column) { + // Not supported as we don't have the underlying Apache POI Cell object. 
+ throw new UnsupportedOperationException("XSSFReader does not support getting the Cell object."); + } + + @Override + public Object getCellValue(int column) { + var cell = data.get((short) column); + if (cell == null) { + return null; + } + + var dataType = cell.dataType(); + return switch (dataType) { + case BLANK -> null; + case BOOL -> cell.getBooleanValue(); + case DATE -> LocalDateTime.parse(cell.strValue()); // Don't believe used by Excel. + case INLINE_STRING, SST_STRING, FORMULA_STRING -> cell.strValue(); + case INTEGER -> cell.getIntegerValue(); + case NUMBER -> { + double dbl = cell.getNumberValue(); + long longVal = (long) dbl; + if (dbl == longVal) { + yield (long) dbl; + } else { + yield dbl; + } + } + case OLE_DATE -> cell.getDateValue(use1904Dates); + case OLE_DATETIME -> cell.getDateTimeValue(use1904Dates); + case ERROR -> null; + }; + } + + @Override + public String getCellText(int column) { + var cell = data.get((short) column); + if (cell == null) { + return ""; + } + + var dataType = cell.dataType(); + return switch (dataType) { + case BLANK -> ""; + case NUMBER, OLE_DATETIME, OLE_DATE, INTEGER -> { + // Special handling for Number or Date cells as want to keep formatting. + var formatText = cell.format(); + if (formatText == null || formatText.isEmpty()) { + yield cell.strValue(); + } + yield formatter.formatRawCellContents(cell.getNumberValue(), -1, formatText, use1904Dates); + } + case BOOL -> cell.getBooleanValue() ? "TRUE" : "FALSE"; + default -> cell.strValue(); + }; + } + + @Override + public boolean isEmpty(int column) { + var cell = data.get((short) column); + return cell == null || cell.strValue().isEmpty(); + } + + @Override + public boolean isEmpty(int start, int end) { + int currentEnd = end == -1 ? 
getLastColumn() : end; + for (int column = Math.max(getFirstColumn(), start); + column <= Math.min(getLastColumn(), currentEnd); + column++) { + if (!isEmpty(column)) { + return false; + } + } + return true; + } + + @Override + public String[] getCellsAsText(int startCol, int endCol) { + int currentEndCol = endCol == -1 ? getLastColumn() : endCol; + + String[] output = new String[currentEndCol - startCol + 1]; + for (int col = startCol; col <= currentEndCol; col++) { + + var cell = data.get((short) col); + if (cell != null && !cell.dataType().isString()) { + // Short circuit if find not a string cell. + return null; + } + + output[col - startCol] = cell == null ? "" : cell.strValue(); + } + + return output; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderSheet.java b/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderSheet.java new file mode 100644 index 000000000000..cdb79cbdbd5b --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderSheet.java @@ -0,0 +1,150 @@ +package org.enso.table.excel.xssfreader; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import javax.xml.parsers.ParserConfigurationException; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.util.XMLHelper; +import org.enso.table.excel.ExcelRow; +import org.enso.table.excel.ExcelSheet; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +public class XSSFReaderSheet implements ExcelSheet { + private final int sheetIdx; + private final String sheetName; + private final String relId; + private final XSSFReaderWorkbook parent; + + private boolean hasReadSheetData = false; + private String dimensions; + private int firstRow; + private int lastRow; + private Map> rowData; + + public XSSFReaderSheet(int sheetIdx, String 
sheetName, String relId, XSSFReaderWorkbook parent) { + this.sheetIdx = sheetIdx; + this.sheetName = sheetName; + this.relId = relId; + this.parent = parent; + } + + private synchronized void ensureReadSheetData() { + if (hasReadSheetData) { + return; + } + + try { + var strings = parent.getSharedStrings(); + var styles = parent.getStyles(); + var handler = + new XSSFReaderSheetXMLHandler(styles, strings) { + @Override + protected void onDimensions(String dimension) { + handleOnDimensions(dimension); + } + + @Override + protected void onStartRow(int rowNum) { + handleOnStartRow(rowNum); + } + + @Override + protected void onCell(int rowNumber, short columnNumber, String ref, CellValue value) { + handleOnCell(rowNumber, columnNumber, value); + } + }; + + var xmlReader = XMLHelper.newXMLReader(); + xmlReader.setContentHandler(handler); + + rowData = new HashMap<>(); + + try { + parent.withReader( + reader -> { + try { + var sheet = reader.getSheet(relId); + xmlReader.parse(new InputSource(sheet)); + } catch (SAXException | InvalidFormatException | IOException e) { + throw new RuntimeException(e); + } + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + + hasReadSheetData = true; + } catch (SAXException | ParserConfigurationException e) { + throw new RuntimeException(e); + } + } + + @Override + public int getSheetIndex() { + return sheetIdx; + } + + @Override + public String getName() { + return sheetName; + } + + public String getDimensions() { + ensureReadSheetData(); + return dimensions; + } + + @Override + public int getFirstRow() { + ensureReadSheetData(); + return firstRow; + } + + @Override + public int getLastRow() { + ensureReadSheetData(); + return lastRow; + } + + @Override + public ExcelRow get(int row) { + ensureReadSheetData(); + + if (!rowData.containsKey(row)) { + return null; + } + + return new XSSFReaderRow(rowData.get(row), parent.use1904Format()); + } + + @Override + public Sheet getSheet() { + // Not supported as we don't have 
the underlying Apache POI Sheet object. + throw new UnsupportedOperationException( + "XSSFReader does not support getting the Sheet object."); + } + + protected void handleOnDimensions(String dimension) { + dimensions = dimension; + } + + private void handleOnStartRow(int rowNum) { + if (firstRow == 0 || rowNum < firstRow) { + firstRow = rowNum; + } + + if (lastRow == 0 || rowNum > lastRow) { + lastRow = rowNum; + } + } + + private void handleOnCell( + int rowNumber, short columnNumber, XSSFReaderSheetXMLHandler.CellValue value) { + rowData.computeIfAbsent(rowNumber, k -> new TreeMap<>()).put(columnNumber, value); + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderSheetXMLHandler.java b/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderSheetXMLHandler.java new file mode 100644 index 000000000000..f3ea10bcbba3 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderSheetXMLHandler.java @@ -0,0 +1,259 @@ +package org.enso.table.excel.xssfreader; + +import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML; + +import java.time.ZonedDateTime; +import java.time.temporal.Temporal; +import org.apache.poi.ss.usermodel.DateUtil; +import org.apache.poi.xssf.model.SharedStrings; +import org.apache.poi.xssf.usermodel.XSSFRichTextString; +import org.enso.table.excel.ExcelUtils; +import org.xml.sax.Attributes; +import org.xml.sax.helpers.DefaultHandler; + +/** Based on the XSSFSheetXMLHandler class from Apache POI. */ +/** + * SAX-based Handler to Read Excel XML on top of POI support. 
Technical specification can be found + * at: + * https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oe376/db9b9b72-b10b-4e7e-844c-09f88c972219 + * https://ecma-international.org/publications-and-standards/standards/ecma-376/ + */ +public class XSSFReaderSheetXMLHandler extends DefaultHandler { + private final XSSFReaderFormats styles; + private final SharedStrings sharedStrings; + + public enum XSSDataType { + BLANK, + BOOL, + DATE, + ERROR, + INLINE_STRING, + SST_STRING, + NUMBER, + INTEGER, + OLE_DATE, + OLE_DATETIME, + FORMULA_STRING; + + public boolean isString() { + return this == INLINE_STRING || this == SST_STRING || this == FORMULA_STRING; + } + } + + // Record if seen a value element + private boolean seenValue; + + // Set when V start element is seen + private boolean vIsOpen; + + // Set when an Inline String "is" is seen + private boolean isIsOpen; + + // The current row being read (or -1 if not in a row) + private int rowNumber = -1; + + // Handle missing rowNumber in the XML (happens in Excel), first row would be row 1. + private int nextRowNumber = 1; + + // The current cell being read (or null if not in a cell) + private String cellRef; + + // Set when cell start element is seen, used when cell close element is seen. + private XSSDataType dataType; + + // Gathers characters as they are seen. 
+ private final StringBuilder value = new StringBuilder(64); + private String numberFormat = null; + + public XSSFReaderSheetXMLHandler(XSSFReaderFormats styles, SharedStrings strings) { + this.styles = styles; + this.sharedStrings = strings; + } + + private boolean isTextTag(String name) { + return "v".equals(name) || "inlineStr".equals(name) || ("t".equals(name) && isIsOpen); + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) { + if (uri != null && !NS_SPREADSHEETML.equals(uri)) { + return; + } + + if (isTextTag(localName)) { + seenValue = true; + vIsOpen = true; + if (!isIsOpen) { + value.setLength(0); + } + } else { + switch (localName) { + case "dimension": // Dimensions of sheet + var dimension = attributes.getValue("ref"); + if (dimension != null) { + onDimensions(dimension); + } + break; + case "row": // Row + String rowNumStr = attributes.getValue("r"); + rowNumber = rowNumStr == null ? nextRowNumber : Integer.parseInt(rowNumStr); + onStartRow(rowNumber); + break; + case "c": // Cell + cellRef = attributes.getValue("r"); + seenValue = false; + + String cellType = attributes.getValue("t"); + if (cellType == null) { + cellType = "n"; // Number is default + } + + dataType = + switch (cellType) { + case "b" -> XSSDataType.BOOL; + case "e" -> XSSDataType.ERROR; + case "d" -> XSSDataType.DATE; // Date in ISO 8601 format. 
+ case "inlineStr" -> XSSDataType.INLINE_STRING; + case "s" -> XSSDataType.SST_STRING; + case "str" -> XSSDataType.FORMULA_STRING; // String formula + default -> XSSDataType.NUMBER; + }; + + // Read the format for NUMBER + numberFormat = null; + if (dataType == XSSDataType.NUMBER) { + String cellStyleStr = attributes.getValue("s"); + if (cellStyleStr != null) { + short styleIndex = (short) Integer.parseInt(cellStyleStr); + numberFormat = styles.getNumberFormatAt(styleIndex); + } + } + break; + case "is": // Inline String + isIsOpen = true; + break; + } + } + } + + /** Captures characters if a suitable element is open. */ + @Override + public void characters(char[] ch, int start, int length) { + if (vIsOpen) { + value.append(ch, start, length); + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + if (uri != null && !NS_SPREADSHEETML.equals(uri)) { + return; + } + + if (isTextTag(localName)) { + vIsOpen = false; + } else { + switch (localName) { + case "sheetData" -> onSheetEnd(); + case "row" -> { + nextRowNumber = rowNumber + 1; + rowNumber = -1; + } + case "c" -> outputCellValue(); + case "is" -> isIsOpen = false; + case "v" -> vIsOpen = false; + } + } + } + + public record CellValue(XSSDataType dataType, String strValue, String format) { + public boolean getBooleanValue() { + return strValue.charAt(0) == '1'; + } + + public double getNumberValue() { + return Double.parseDouble(strValue); + } + + public long getIntegerValue() { + return Long.parseLong(strValue); + } + + public Temporal getDateValue(boolean use1904Dates) { + return use1904Dates + ? 
ExcelUtils.fromExcelDateTime1904(getIntegerValue()) + : ExcelUtils.fromExcelDateTime(getIntegerValue()); + } + + public Temporal getDateTimeValue(boolean use1904Dates) { + if (use1904Dates) { + var datetime = ExcelUtils.fromExcelDateTime1904(getNumberValue()); + if (datetime instanceof ZonedDateTime zdt + && zdt.getYear() == 1904 + && zdt.getDayOfYear() == 1 + && !format.contains("y") + && !format.contains("M") + && !format.contains("d")) { + datetime = zdt.toLocalTime(); + } + return datetime; + } + + return ExcelUtils.fromExcelDateTime(getNumberValue()); + } + } + + public String getStringValue() { + if (dataType == XSSDataType.SST_STRING) { + return getSharedString(value.toString()); + } else if (dataType == XSSDataType.INLINE_STRING) { + return new XSSFRichTextString(value.toString()).toString(); + } + return value.toString(); + } + + private String getSharedString(String value) { + int idx = Integer.parseInt(value); + var ss = sharedStrings.getItemAt(idx); + return ss == null ? null : ss.toString(); + } + + private void outputCellValue() { + short columnNumber = 0; + int i = 0; + char c; + while (i < cellRef.length() && (c = cellRef.charAt(i)) >= 'A' && c <= 'Z') { + columnNumber = (short) (columnNumber * 26 + (c - 'A' + 1)); + i++; + } + + if (!seenValue) { + onCell(rowNumber, columnNumber, cellRef, new CellValue(XSSDataType.BLANK, "", null)); + return; + } + + var stringValue = getStringValue(); + if (dataType == XSSDataType.NUMBER) { + boolean isInteger = !stringValue.contains("."); + boolean isDate = DateUtil.isADateFormat(-1, numberFormat); + if (isInteger && isDate) { + dataType = XSSDataType.OLE_DATE; + } else if (isInteger) { + dataType = XSSDataType.INTEGER; + } else if (isDate) { + dataType = XSSDataType.OLE_DATETIME; + } + } + + var cellValue = new CellValue(dataType, stringValue, numberFormat); + onCell(rowNumber, columnNumber, cellRef, cellValue); + } + + protected void onDimensions(String dimension) {} + + protected void onStartRow(int rowNumber) 
{} + + protected void onCell(int rowNumber, short columnNumber, String ref, CellValue cellValue) {} + + protected void onSheetEnd() {} +} diff --git a/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderWorkbook.java b/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderWorkbook.java new file mode 100644 index 000000000000..6502057ff416 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/excel/xssfreader/XSSFReaderWorkbook.java @@ -0,0 +1,284 @@ +package org.enso.table.excel.xssfreader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import javax.xml.XMLConstants; +import javax.xml.namespace.NamespaceContext; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; +import org.apache.poi.ooxml.util.DocumentHelper; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.exceptions.OpenXML4JException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackageAccess; +import org.apache.poi.ss.usermodel.RichTextString; +import org.apache.poi.xssf.eventusermodel.XSSFReader; +import org.apache.poi.xssf.model.SharedStrings; +import org.apache.poi.xssf.usermodel.XSSFRelation; +import org.enso.table.excel.ExcelSheet; +import org.enso.table.excel.ExcelWorkbook; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +public class XSSFReaderWorkbook implements ExcelWorkbook { + private static final XPathFactory xpathFactory = XPathFactory.newInstance(); + private static final NamespaceContext namespaceContext = new SpreadsheetContext(); + private static final Map 
xpathCache = new HashMap<>(); + + private static XPathExpression compileXPathWithNamespace(String xpath) + throws XPathExpressionException { + if (!xpathCache.containsKey(xpath)) { + var newXPath = xpathFactory.newXPath(); + newXPath.setNamespaceContext(namespaceContext); + var compiled = newXPath.compile(xpath); + xpathCache.put(xpath, compiled); + } + return xpathCache.get(xpath); + } + + private static class SpreadsheetContext implements NamespaceContext { + @Override + public String getNamespaceURI(String prefix) { + if (prefix == null) { + throw new IllegalArgumentException("prefix cannot be null"); + } + return prefix.equals("ss") ? XSSFRelation.NS_SPREADSHEETML : XMLConstants.NULL_NS_URI; + } + + @Override + public String getPrefix(String namespaceURI) { + if (namespaceURI == null) { + throw new IllegalArgumentException("namespaceURI cannot be null"); + } + return namespaceURI.equals(XSSFRelation.NS_SPREADSHEETML) ? "ss" : null; + } + + @Override + public Iterator getPrefixes(String namespaceURI) { + if (namespaceURI == null) { + throw new IllegalArgumentException("namespaceURI cannot be null"); + } + return namespaceURI.equals(XSSFRelation.NS_SPREADSHEETML) + ? 
Collections.singleton("ss").iterator() + : Arrays.stream(new String[0]).iterator(); + } + } + + public static final String WORKBOOK_CONFIG_XPATH = "/ss:workbook/ss:workbookPr"; + public static final String SHEET_NAME_XPATH = "/ss:workbook/ss:sheets/ss:sheet"; + public static final String NAMED_RANGE_XPATH = "/ss:workbook/ss:definedNames/ss:definedName"; + + private final String path; + + private boolean use1904DateSystemFlag = false; + private List sheetInfos; + private Map sheetInfoMap; + private Map namedRangeMap; + + private boolean hasReadShared = false; + private SharedStrings sharedStrings; + private XSSFReaderFormats styles; + + public XSSFReaderWorkbook(String path) throws IOException { + this.path = path; + + // Read the workbook data + this.readWorkbookData(); + } + + public String getPath() { + return path; + } + + void withReader(Consumer action) throws IOException { + try (var pkg = OPCPackage.open(path, PackageAccess.READ)) { + var reader = new XSSFReader(pkg); + action.accept(reader); + } catch (OpenXML4JException e) { + throw new IOException( + "Invalid format encountered when opening the file " + path + " as XLSX.", e); + } + } + + private record SheetInfo(int index, int sheetId, String name, String relID, boolean visible) {} + + private record NamedRange(String name, String formula) {} + + private void readWorkbookData() throws IOException { + withReader( + reader -> { + try { + var workbookData = reader.getWorkbookData(); + var workbookDoc = DocumentHelper.readDocument(workbookData); + read1904DateSetting(workbookDoc); + readSheetInfo(workbookDoc); + readNamedRanges(workbookDoc); + } catch (SAXException + | IOException + | InvalidFormatException + | XPathExpressionException e) { + throw new RuntimeException(e); + } + }); + } + + private void readNamedRanges(Document workbookDoc) throws XPathExpressionException { + var namesXPath = compileXPathWithNamespace(NAMED_RANGE_XPATH); + var nameNodes = (NodeList) namesXPath.evaluate(workbookDoc, 
XPathConstants.NODESET); + namedRangeMap = new HashMap<>(); + for (int i = 0; i < nameNodes.getLength(); i++) { + var node = nameNodes.item(i); + var name = node.getAttributes().getNamedItem("name").getNodeValue(); + var formula = node.getTextContent(); + namedRangeMap.put(name, new NamedRange(name, formula)); + } + } + + private void readSheetInfo(Document workbookDoc) throws XPathExpressionException { + var sheetXPath = compileXPathWithNamespace(SHEET_NAME_XPATH); + var sheetNodes = (NodeList) sheetXPath.evaluate(workbookDoc, XPathConstants.NODESET); + sheetInfos = new ArrayList<>(sheetNodes.getLength()); + sheetInfoMap = new HashMap<>(); + for (int i = 0; i < sheetNodes.getLength(); i++) { + var node = sheetNodes.item(i); + var sheetName = node.getAttributes().getNamedItem("name").getNodeValue(); + var sheetId = Integer.parseInt(node.getAttributes().getNamedItem("sheetId").getNodeValue()); + var relId = node.getAttributes().getNamedItem("r:id").getNodeValue(); + var visible = node.getAttributes().getNamedItem("state") == null; + var sheetInfo = new SheetInfo(i, sheetId, sheetName, relId, visible); + sheetInfos.add(sheetInfo); + sheetInfoMap.put(sheetName, sheetInfo); + } + } + + private void read1904DateSetting(Document workbookDoc) throws XPathExpressionException { + var workbookXPath = compileXPathWithNamespace(WORKBOOK_CONFIG_XPATH); + var workbookNode = (Node) workbookXPath.evaluate(workbookDoc, XPathConstants.NODE); + if (workbookNode != null) { + var date1904 = workbookNode.getAttributes().getNamedItem("date1904"); + use1904DateSystemFlag = date1904 != null && "1".equals(date1904.getNodeValue()); + } + } + + private synchronized void ensureReadShared() { + if (hasReadShared) { + return; + } + + try { + withReader( + reader -> { + try { + reader.setUseReadOnlySharedStringsTable(true); + sharedStrings = reader.getSharedStringsTable(); + if (sharedStrings == null) { + sharedStrings = + new SharedStrings() { + @Override + public RichTextString getItemAt(int 
idx) { + return null; + } + + @Override + public int getCount() { + return 0; + } + + @Override + public int getUniqueCount() { + return 0; + } + }; + } + + // Read the styles table and attach the format data + var stylesTable = reader.getStylesTable(); + styles = new XSSFReaderFormats(stylesTable); + + hasReadShared = true; + } catch (InvalidFormatException | IOException e) { + throw new RuntimeException(e); + } + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** Flag that workbook is in 1904 format. */ + boolean use1904Format() { + return use1904DateSystemFlag; + } + + @Override + public int getNumberOfSheets() { + return sheetInfoMap.size(); + } + + @Override + public int getSheetIndex(String name) { + if (!sheetInfoMap.containsKey(name)) { + return -1; + } + return sheetInfoMap.get(name).index; + } + + @Override + public String getSheetName(int sheet) { + if (sheet < 0 || sheet >= sheetInfos.size()) { + throw new IllegalArgumentException("Sheet index out of range: " + sheet); + } + return sheetInfos.get(sheet).name; + } + + @Override + public int getNumberOfNames() { + return namedRangeMap.size(); + } + + @Override + public String[] getRangeNames() { + return namedRangeMap.keySet().toArray(String[]::new); + } + + @Override + public String getNameFormula(String name) { + var namedRange = namedRangeMap.get(name); + return namedRange == null ? 
null : namedRange.formula; + } + + public SharedStrings getSharedStrings() { + ensureReadShared(); + return sharedStrings; + } + + public XSSFReaderFormats getStyles() { + ensureReadShared(); + return styles; + } + + @Override + public ExcelSheet getSheetAt(int sheetIndex) { + if (sheetIndex < 0 || sheetIndex >= sheetInfos.size()) { + throw new IllegalArgumentException("Sheet index out of range: " + sheetIndex); + } + var sheetInfo = sheetInfos.get(sheetIndex); + return new XSSFReaderSheet(sheetIndex, sheetInfo.name, sheetInfo.relID, this); + } + + @Override + public void close() throws IOException { + // Nothing to do + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/read/ExcelReader.java b/std-bits/table/src/main/java/org/enso/table/read/ExcelReader.java index 1a8e1ab54c60..6f6b289e8998 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/ExcelReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/ExcelReader.java @@ -7,9 +7,6 @@ import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; -import org.apache.poi.openxml4j.exceptions.InvalidFormatException; -import org.apache.poi.ss.usermodel.Name; -import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.util.CellReference; import org.enso.table.data.column.builder.Builder; import org.enso.table.data.column.builder.InferredBuilder; @@ -24,6 +21,7 @@ import org.enso.table.excel.ExcelRange; import org.enso.table.excel.ExcelRow; import org.enso.table.excel.ExcelSheet; +import org.enso.table.excel.ExcelWorkbook; import org.enso.table.excel.ReadOnlyExcelConnection; import org.enso.table.problems.ProblemAggregator; import org.graalvm.polyglot.Context; @@ -38,18 +36,17 @@ public class ExcelReader { * @return a String[] containing the sheet names. 
* @throws IOException when the action fails */ - public static String[] readSheetNames(File file, ExcelFileFormat format) - throws IOException, InvalidFormatException { + public static String[] readSheetNames(File file, ExcelFileFormat format) throws IOException { return withWorkbook(file, format, ExcelReader::readSheetNames); } /** * Reads a list of sheet names from a workbook into an array. * - * @param workbook a {@link Workbook} to read the sheet names from. + * @param workbook a {@link ExcelWorkbook} to read the sheet names from. * @return a String[] containing the sheet names. */ - public static String[] readSheetNames(Workbook workbook) { + public static String[] readSheetNames(ExcelWorkbook workbook) { int sheetCount = workbook.getNumberOfSheets(); var output = new String[sheetCount]; Context context = Context.getCurrent(); @@ -68,20 +65,8 @@ public static String[] readSheetNames(Workbook workbook) { * @return a String[] containing the range names. * @throws IOException when the action fails */ - public static String[] readRangeNames(File file, ExcelFileFormat format) - throws IOException, InvalidFormatException { - return withWorkbook(file, format, ExcelReader::readRangeNames); - } - - /** - * Reads a list of range names for the specified XLSX/XLS file into an array. - * - * @param workbook a {@link Workbook} to read the sheet names from. - * @return a String[] containing the range names. - */ - public static String[] readRangeNames(Workbook workbook) { - var names = workbook.getAllNames(); - return names.stream().map(Name::getNameName).toArray(String[]::new); + public static String[] readRangeNames(File file, ExcelFileFormat format) throws IOException { + return withWorkbook(file, format, ExcelWorkbook::getRangeNames); } /** @@ -202,7 +187,7 @@ public static Table readRangeByName( /** * Reads a range by sheet name, named range or address for the workbook into a table. * - * @param workbook a {@link Workbook} to read from. 
+ * @param workbook a {@link ExcelWorkbook} to read from. * @param rangeNameOrAddress sheet name, range name or address to read. * @param headers specifies whether the first row should be used as headers. * @param skip_rows skip rows from the top of the range. @@ -211,7 +196,7 @@ public static Table readRangeByName( * @throws InvalidLocationException when the range name or address is not found. */ public static Table readRangeByName( - Workbook workbook, + ExcelWorkbook workbook, String rangeNameOrAddress, ExcelHeaders.HeaderBehavior headers, int skip_rows, @@ -230,11 +215,10 @@ public static Table readRangeByName( problemAggregator); } - Name name = workbook.getName(rangeNameOrAddress); - ExcelRange excelRange; try { - excelRange = new ExcelRange(name == null ? rangeNameOrAddress : name.getRefersToFormula()); + var formula = workbook.getNameFormula(rangeNameOrAddress); + excelRange = new ExcelRange(formula == null ? rangeNameOrAddress : formula); } catch (IllegalArgumentException e) { throw new InvalidLocationException( rangeNameOrAddress, @@ -271,8 +255,8 @@ public static Table readRange( readRange(workbook, excelRange, headers, skip_rows, row_limit, problemAggregator)); } - private static T withWorkbook(File file, ExcelFileFormat format, Function action) - throws IOException { + private static T withWorkbook( + File file, ExcelFileFormat format, Function action) throws IOException { try (ReadOnlyExcelConnection connection = ExcelConnectionPool.INSTANCE.openReadOnlyConnection(file, format)) { return connection.withWorkbook(action); @@ -280,7 +264,7 @@ private static T withWorkbook(File file, ExcelFileFormat format, Function tsv_file = base_dir / "1.tsv"