From e1efc19dd77b724f329c39f6dcff0ad43a2e4a1f Mon Sep 17 00:00:00 2001 From: Tyler Gregg Date: Wed, 29 Nov 2023 11:07:11 -0800 Subject: [PATCH] Optimizes parsing of binary Ion 1.1 numeric types. --- .../com/amazon/ion/impl/IonCursorBinary.java | 12 +- .../impl/IonReaderContinuableCoreBinary.java | 144 +++++++++++------- ...onReaderContinuableTopLevelBinaryTest.java | 30 ++++ 3 files changed, 129 insertions(+), 57 deletions(-) diff --git a/src/main/java/com/amazon/ion/impl/IonCursorBinary.java b/src/main/java/com/amazon/ion/impl/IonCursorBinary.java index 8dad06f48..3d9f8c0d3 100644 --- a/src/main/java/com/amazon/ion/impl/IonCursorBinary.java +++ b/src/main/java/com/amazon/ion/impl/IonCursorBinary.java @@ -966,8 +966,11 @@ private long uncheckedReadFlexUInt_1_1() { // Single-byte. return currentByte >>> 1; } - if (currentByte == 0) { // The first byte is 0, so there are at least 9 bytes. - throw new IonException("Found a VarUInt that was too large to fit in a `long`"); + if (currentByte == 0) { + // Note: this is conservative, as 9-byte FlexUInts (with a continuation bit in the second byte) can fit + // in a long. However, the FlexUInt parsing methods in this class are only used to calculate value length, + // and the added complexity is not warranted to increase the maximum value size above 2^56 - 1 (72 PB). + throw new IonException("Found a FlexUInt that was too large to fit in a `long`"); } // TODO perf: try putting the rest in its own method byte length = (byte) (Integer.numberOfTrailingZeros(currentByte) + 1); @@ -989,7 +992,10 @@ private long slowReadFlexUInt_1_1() { return -1; } if (currentByte == 0) { - throw new IonException("Found a VarUInt that was too large to fit in a `long`"); + // Note: this is conservative, as 9-byte FlexUInts (with a continuation bit in the second byte) can fit + // in a long. However, the FlexUInt parsing methods in this class are only used to calculate value length, + // and the added complexity is not warranted to increase the maximum value size above 2^56 - 1 (72 PB). + throw new IonException("Found a FlexUInt that was too large to fit in a `long`"); } byte length = (byte) (Integer.numberOfTrailingZeros(currentByte) + 1); long result = currentByte >>> length; diff --git a/src/main/java/com/amazon/ion/impl/IonReaderContinuableCoreBinary.java b/src/main/java/com/amazon/ion/impl/IonReaderContinuableCoreBinary.java index 3228ec355..9c3659e15 100644 --- a/src/main/java/com/amazon/ion/impl/IonReaderContinuableCoreBinary.java +++ b/src/main/java/com/amazon/ion/impl/IonReaderContinuableCoreBinary.java @@ -436,14 +436,20 @@ private boolean classifyInteger_1_0() { } /** - * Reads a FlexUInt into a long. After this method returns, `peekIndex` points to the first byte after the end of - * the FlexUInt. + * Reads a 3+ byte FlexUInt into a long. After this method returns, `peekIndex` points to the first byte after the + * end of the FlexUInt. + * @param firstByte the first byte of the FlexUInt. * @return the value. */ - long readFlexUInt_1_1() { - int currentByte = buffer[(int)(peekIndex++)] & SINGLE_BYTE_MASK; - byte length = (byte) (Integer.numberOfTrailingZeros(currentByte) + 1); - long result = currentByte >>> length; + private long readLargeFlexUInt_1_1(int firstByte) { + if (firstByte == 0) { + // Note: this is conservative, as 9-byte flex subfields (with a continuation bit in the second byte) can fit + // in a long. However, the flex subfields parsed by the methods in this class are used only in cases that + // require an int anyway (symbol IDs, decimal scale), so the added complexity is not warranted. + throw new IonException("Flex subfield exceeds the length of a long."); + } + byte length = (byte) (Integer.numberOfTrailingZeros(firstByte) + 1); + long result = firstByte >>> length; for (byte i = 1; i < length; i++) { result |= ((long) (buffer[(int) (peekIndex++)] & SINGLE_BYTE_MASK) << (8 * i - length)); } @@ -451,17 +457,36 @@ long readFlexUInt_1_1() { } /** - * Reads a FlexInt into a long. After this method returns, `peekIndex` points to the first byte after the end of - * the FlexInt. + * Reads a FlexUInt into a long. After this method returns, `peekIndex` points to the first byte after the end of + * the FlexUInt. * @return the value. */ - long readFlexInt_1_1() { + long readFlexUInt_1_1() { + // Up-cast to int, ensuring the most significant bit in the byte is not treated as the sign. int currentByte = buffer[(int)(peekIndex++)] & SINGLE_BYTE_MASK; - byte length = (byte) (Integer.numberOfTrailingZeros(currentByte) + 1); - long result = currentByte >>> length; - for (byte i = 1; i < length; i++) { - result |= ((long) (buffer[(int) (peekIndex++)] & SINGLE_BYTE_MASK) << (8 * i - length)); + if ((currentByte & 1) == 1) { + // Single byte; shift out the continuation bit. + return currentByte >>> 1; } + if ((currentByte & 2) != 0) { + // Two bytes; upcast the second byte to int, ensuring the most significant bit is not treated as the sign. + // Make room for the six value bits in the first byte. Or with those six value bits after shifting out the + // two continuation bits. + return ((buffer[(int) peekIndex++] & SINGLE_BYTE_MASK) << 6 ) | (currentByte >>> 2); + } + return readLargeFlexUInt_1_1(currentByte); + } + + /** + * Reads a 3+ byte FlexInt into a long. After this method returns, `peekIndex` points to the first byte after the + * end of the FlexInt. + * @return the value. + */ + long readLargeFlexInt_1_1(int firstByte) { + firstByte &= SINGLE_BYTE_MASK; + // FlexInts are essentially just FlexUInts that interpret the most significant bit as a sign that needs to be + // extended. + long result = readLargeFlexUInt_1_1(firstByte); if (buffer[(int) peekIndex - 1] < 0) { // Sign extension. result |= ~(-1 >>> Long.numberOfLeadingZeros(result)); @@ -469,6 +494,26 @@ long readFlexInt_1_1() { return result; } + /** + * Reads a FlexInt into a long. After this method returns, `peekIndex` points to the first byte after the end of + * the FlexInt. + * @return the value. + */ + long readFlexInt_1_1() { + // The following up-cast to int performs sign extension, if applicable. + int currentByte = buffer[(int)(peekIndex++)]; + if ((currentByte & 1) == 1) { + // Single byte; shift out the continuation bit while preserving the sign. + return currentByte >> 1; + } + if ((currentByte & 2) != 0) { + // Two bytes; up-cast the second byte to int, thereby performing sign extension. Make room for the six + // value bits in the first byte. Or with those six value bits after shifting out the two continuation bits. + return buffer[(int) peekIndex++] << 6 | ((currentByte & SINGLE_BYTE_MASK) >>> 2); + } + return readLargeFlexInt_1_1(currentByte); + } + private int readVarSym_1_1(Marker marker) { throw new UnsupportedOperationException(); } @@ -494,40 +539,38 @@ private long readFixedInt_1_1() { } /** - * Copies a FixedInt or FixedUInt into scratch space, converting it to its equivalent big-endian two's complement - * representation. If the provided length is longer than the actual length of the value, the most significant - * byte in the two's complement representation will be zero. - * @param startIndex the index of the second byte in the FixedInt or FixedUInt representation. + * Reads a FixedInt or FixedUInt as a BigInteger, first copying the value into scratch space and converting it to + * its equivalent big-endian two's complement representation. If the provided length is longer than the actual + * length of the value, the most significant byte in the two's complement representation will be zero, maintaining + * a positive sign. * @param length the number of bytes remaining in the FixedInt or FixedUInt representation. - * @return a byte[] (either new or reused) containing the big-endian two's complement representation of the value. + * @return a new BigInteger that represents the value. */ - private byte[] copyFixedIntOrFixedUIntAsTwosComplementBytes(long startIndex, int length) { + private BigInteger readLargeFixedIntOrFixedUIntAsBigInteger(int length) { // FixedInt is a little-endian two's complement representation. Simply reverse the bytes. byte[] bytes = getScratchForSize(length); // Clear the most significant byte in case the scratch space is padded to accommodate an unsigned value with // its highest bit set. bytes[0] = 0; int copyIndex = bytes.length; - for (long i = startIndex; i < valueMarker.endIndex; i++) { + for (long i = peekIndex; i < valueMarker.endIndex; i++) { bytes[--copyIndex] = buffer[(int) i]; } peekIndex = valueMarker.endIndex; - return bytes; + return new BigInteger(bytes); } /** - * Reads a FixedInt or FixedUInt value into a BigInteger. - * @param length the length of the two's complement representation of the value. For FixedInts, this is always - * equal to the length of the value; for FixedUInts, this is one byte larger than the length of the - * value if the highest bit in the unsigned representation is set. + * Reads a FixedInt value into a BigInteger. * @return the value. */ - private BigInteger readFixedIntOrFixedUIntAsBigInteger_1_1(int length) { + private BigInteger readFixedIntAsBigInteger_1_1() { BigInteger value; - if (length > 0) { - value = new BigInteger(copyFixedIntOrFixedUIntAsTwosComplementBytes(peekIndex, length)); + int length = (int) (valueMarker.endIndex - peekIndex); + if (length <= LONG_SIZE_IN_BYTES) { + value = BigInteger.valueOf(readFixedInt_1_1()); } else { - value = BigInteger.ZERO; + value = readLargeFixedIntOrFixedUIntAsBigInteger(length); } return value; } @@ -545,7 +588,7 @@ private BigDecimal readBigDecimal_1_1() { value = BigDecimal.valueOf(readFixedInt_1_1(), scale); } else { // The coefficient may overflow a long, so a BigInteger is required. - value = new BigDecimal(readFixedIntOrFixedUIntAsBigInteger_1_1(length), scale); + value = new BigDecimal(readLargeFixedIntOrFixedUIntAsBigInteger(length), scale); } return value; } @@ -557,14 +600,13 @@ private BigDecimal readBigDecimal_1_1() { private Decimal readDecimal_1_1() { int scale = (int) -readFlexInt_1_1(); BigInteger coefficient; - int length = (int) (valueMarker.endIndex - peekIndex); - if (length > 0) { - // NOTE: there is a BigInteger.valueOf(long unscaledValue, int scale) factory method that avoids allocating + if (valueMarker.endIndex > peekIndex) { + // NOTE: there is a BigDecimal.valueOf(long unscaledValue, int scale) factory method that avoids allocating // a BigInteger for coefficients that fit in a long. See its use in readBigDecimal() above. Unfortunately, // it is not possible to use this for Decimal because the necessary BigDecimal constructor is // package-private. If a compatible BigDecimal constructor is added in a future JDK revision, a // corresponding factory method should be added to Decimal to enable this optimization. - coefficient = readFixedIntOrFixedUIntAsBigInteger_1_1(length); + coefficient = readFixedIntAsBigInteger_1_1(); if (coefficient.signum() == 0) { return Decimal.negativeZero(scale); } @@ -590,7 +632,7 @@ private long readLong_1_1() { */ private BigInteger readBigInteger_1_1() { peekIndex = valueMarker.startIndex; - return readFixedIntOrFixedUIntAsBigInteger_1_1((int) (valueMarker.endIndex - peekIndex)); + return readFixedIntAsBigInteger_1_1(); } /** @@ -607,21 +649,17 @@ private BigDecimal readTimestampFraction_1_1() { BigDecimal value; peekIndex = valueMarker.startIndex + L_TIMESTAMP_SECOND_BYTE_LENGTH; int scale = (int) readFlexUInt_1_1(); - if (peekIndex >= valueMarker.endIndex) { - return BigDecimal.valueOf(0, scale); - } int length = (int) (valueMarker.endIndex - peekIndex); - // Since the coefficient is stored in a FixedUInt, some 8-byte values cannot fit in a signed 8-byte long. - // Take the quick path for values up to 7 bytes rather than performing additional checks. This should cover - // almost all real-world timestamp fractions. - if (length <= 7) { - // No need to allocate a BigInteger to hold the coefficient. - value = BigDecimal.valueOf(readFixedUInt_1_1(peekIndex, valueMarker.endIndex), scale); + if (length >= LONG_SIZE_IN_BYTES) { + if (buffer[(int) valueMarker.endIndex - 1] < 0) { + // The most-significant bit is set; pad the length by one byte so that the value remains unsigned. + length += 1; + } + value = new BigDecimal(readLargeFixedIntOrFixedUIntAsBigInteger(length), scale); + } else if (length > 0) { + value = BigDecimal.valueOf(readFixedUInt_1_1(), scale); } else { - // The coefficient may overflow a long, so a BigInteger is required. - // If the most-significant bit is set, pad the length by one byte so that the value remains unsigned. - length += (buffer[(int) valueMarker.endIndex - 1] < 0) ? 1 : 0; - value = new BigDecimal(readFixedIntOrFixedUIntAsBigInteger_1_1(length), scale); + value = BigDecimal.valueOf(0, scale); } if (BigDecimal.ONE.compareTo(value) < 1) { throw new IllegalArgumentException(String.format("Fractional seconds %s must be greater than or equal to 0 and less than 1", value)); @@ -851,15 +889,13 @@ private long readUInt(long startIndex, long endIndex) { } /** - * Reads a FixedUInt (little-endian). - * @param startIndex the index of the first byte in the FixedUInt value. - * @param endIndex the index of the first byte after the end of the FixedUInt value. + * Reads a FixedUInt (little-endian), starting at `peekIndex` and ending at `valueMarker.endIndex`. * @return the value. */ - private long readFixedUInt_1_1(long startIndex, long endIndex) { + private long readFixedUInt_1_1() { long result = 0; - for (int i = (int) startIndex; i < endIndex; i++) { - result |= ((long) (buffer[i] & SINGLE_BYTE_MASK) << ((i - startIndex) * VALUE_BITS_PER_UINT_BYTE)); + for (int i = (int) peekIndex; i < valueMarker.endIndex; i++) { + result |= ((long) (buffer[i] & SINGLE_BYTE_MASK) << ((i - peekIndex) * VALUE_BITS_PER_UINT_BYTE)); } return result; } diff --git a/src/test/java/com/amazon/ion/impl/IonReaderContinuableTopLevelBinaryTest.java b/src/test/java/com/amazon/ion/impl/IonReaderContinuableTopLevelBinaryTest.java index 7b4ac7bf7..d88262c6a 100644 --- a/src/test/java/com/amazon/ion/impl/IonReaderContinuableTopLevelBinaryTest.java +++ b/src/test/java/com/amazon/ion/impl/IonReaderContinuableTopLevelBinaryTest.java @@ -4621,6 +4621,15 @@ public void readTimestampValueWithKnownOffsetShortForm(@ConvertWith(StringToTime "1947-12-23T23:59:00.9999999Z, 11110111 00011001 10011011 00000111 11011111 10111011 10000011 00010110 00000000 00001111 01111111 10010110 10011000 00000000", "1947-12-23T23:59:00.99999999Z, 11110111 00011001 10011011 00000111 11011111 10111011 10000011 00010110 00000000 00010001 11111111 11100000 11110101 00000101", + "1947-12-23T23:59:00.36028797018963968Z, " + // 7-byte coefficient, most-significant bit set + "11110111 00011111 10011011 00000111 11011111 10111011 10000011 00010110 00000000 00100011 00000000 00000000 00000000 00000000 00000000 00000000 10000000", + + "1947-12-23T23:59:00.36028797018963967Z, " + // 7-byte coefficient, most-significant bit set + "11110111 00011111 10011011 00000111 11011111 10111011 10000011 00010110 00000000 00100011 11111111 11111111 11111111 11111111 11111111 11111111 01111111", + + "1947-12-23T23:59:00.72057594037927935Z, " + // 7-byte coefficient, all bits set + "11110111 00011111 10011011 00000111 11011111 10111011 10000011 00010110 00000000 00100011 11111111 11111111 11111111 11111111 11111111 11111111 11111111", + "1947-12-23T23:59:00.9223372036854775807Z, " + // Long.MAX_VALUE "11110111 00100001 10011011 00000111 11011111 10111011 10000011 00010110 00000000 00100111 11111111 11111111 11111111 11111111 11111111 11111111 11111111 01111111", @@ -4657,6 +4666,27 @@ public void readTimestampValueLongForm(@ConvertWith(StringToTimestamp.class) Tim assertIonTimestampCorrectlyParsed(false, expectedValue, inputBits); } + /** + * Verifies that the reader fails to parse a value as a timestamp. + */ + private void failOnInvalidTimestamp(String inputBits, boolean constructFromBytes) throws Exception { + reader = readerForIon11(bitStringToByteArray(inputBits), constructFromBytes); + assertSequence(next(IonType.TIMESTAMP)); + assertThrows(IllegalArgumentException.class, () -> reader.timestampValue()); + } + + @ParameterizedTest + @CsvSource({ + // YYYYYYYY MMYYYYYY HDDDDDMM mmmmHHHH oooooomm ssoooooo ....ssss Scale+ Coefficient + // 1947-12-23T23:59:00.128d-2 (fraction greater than 1) + "11110111 00010011 10011011 00000111 11011111 10111011 10000011 00010110 00000000 00000101 10000000", + + }) + public void failOnInvalidLongFormTimestamp(String inputBits) throws Exception { + failOnInvalidTimestamp(inputBits, true); + failOnInvalidTimestamp(inputBits, false); + } + /** * Checks that the reader reads the expected string value from the given input bits. */