From 84987ed1229e6e98a29670a700fc11b937cef16d Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Sat, 28 Sep 2024 04:52:16 +0800 Subject: [PATCH 001/101] Initial implementation of toggling explicit MV entry size for MVFixedByteRawFwdIndex --- .../MultiValueFixedByteRawIndexCreator.java | 33 +++-- .../FixedByteChunkMVForwardIndexReader.java | 19 +-- ...ultiValueFixedByteRawIndexCreatorTest.java | 118 ++++++++++-------- 3 files changed, 100 insertions(+), 70 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index dcdcb9970516..c7a4dafca9a5 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -39,6 +39,7 @@ public class MultiValueFixedByteRawIndexCreator implements ForwardIndexCreator { private final VarByteChunkWriter _indexWriter; private final DataType _valueType; + private boolean _explicitMVEntrySize = true; /** * Create a var-byte raw index creator for the given column @@ -101,6 +102,14 @@ public boolean isSingleValue() { return false; } + public boolean isExplicitMVEntrySize() { + return _explicitMVEntrySize; + } + + public void setExplicitMVEntrySize(boolean explicitMVEntrySize) { + _explicitMVEntrySize = explicitMVEntrySize; + } + @Override public DataType getValueType() { return _valueType; @@ -110,8 +119,10 @@ public DataType getValueType() { public void putIntMV(int[] values) { byte[] bytes = new byte[Integer.BYTES + values.length * Integer.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the length - byteBuffer.putInt(values.length); + if (_explicitMVEntrySize) { + //write the length + 
byteBuffer.putInt(values.length); + } //write the content of each element for (int value : values) { byteBuffer.putInt(value); @@ -123,8 +134,10 @@ public void putIntMV(int[] values) { public void putLongMV(long[] values) { byte[] bytes = new byte[Integer.BYTES + values.length * Long.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the length - byteBuffer.putInt(values.length); + if (_explicitMVEntrySize) { + //write the length + byteBuffer.putInt(values.length); + } //write the content of each element for (long value : values) { byteBuffer.putLong(value); @@ -136,8 +149,10 @@ public void putLongMV(long[] values) { public void putFloatMV(float[] values) { byte[] bytes = new byte[Integer.BYTES + values.length * Float.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the length - byteBuffer.putInt(values.length); + if (_explicitMVEntrySize) { + //write the length + byteBuffer.putInt(values.length); + } //write the content of each element for (float value : values) { byteBuffer.putFloat(value); @@ -149,8 +164,10 @@ public void putFloatMV(float[] values) { public void putDoubleMV(double[] values) { byte[] bytes = new byte[Integer.BYTES + values.length * Double.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the length - byteBuffer.putInt(values.length); + if (_explicitMVEntrySize) { + //write the length + byteBuffer.putInt(values.length); + } //write the content of each element for (double value : values) { byteBuffer.putDouble(value); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java index f96ed6e878a6..2682c2392edf 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java @@ -35,6 +35,7 @@ public final class FixedByteChunkMVForwardIndexReader extends BaseChunkForwardIn private static final int ROW_OFFSET_SIZE = VarByteChunkForwardIndexWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE; private final int _maxChunkSize; + private boolean _explicitMVEntrySize = true; public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType storedType) { super(dataBuffer, storedType, false); @@ -54,7 +55,7 @@ public ChunkReaderContext createContext() { @Override public int getIntMV(int docId, int[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Integer.BYTES; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); } @@ -64,7 +65,7 @@ public int getIntMV(int docId, int[] valueBuffer, ChunkReaderContext context) { @Override public int[] getIntMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Integer.BYTES; int[] valueBuffer = new int[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); @@ -75,7 +76,7 @@ public int[] getIntMV(int docId, ChunkReaderContext context) { @Override public int getLongMV(int docId, long[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? 
byteBuffer.getInt() : byteBuffer.remaining() / Long.BYTES; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getLong(); } @@ -85,7 +86,7 @@ public int getLongMV(int docId, long[] valueBuffer, ChunkReaderContext context) @Override public long[] getLongMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Long.BYTES; long[] valueBuffer = new long[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getLong(); @@ -96,7 +97,7 @@ public long[] getLongMV(int docId, ChunkReaderContext context) { @Override public int getFloatMV(int docId, float[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Float.BYTES; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); } @@ -106,7 +107,7 @@ public int getFloatMV(int docId, float[] valueBuffer, ChunkReaderContext context @Override public float[] getFloatMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Float.BYTES; float[] valueBuffer = new float[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); @@ -117,7 +118,7 @@ public float[] getFloatMV(int docId, ChunkReaderContext context) { @Override public int getDoubleMV(int docId, double[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? 
byteBuffer.getInt() : byteBuffer.remaining() / Double.BYTES; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getDouble(); } @@ -127,7 +128,7 @@ public int getDoubleMV(int docId, double[] valueBuffer, ChunkReaderContext conte @Override public double[] getDoubleMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Double.BYTES; double[] valueBuffer = new double[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getDouble(); @@ -138,7 +139,7 @@ public double[] getDoubleMV(int docId, ChunkReaderContext context) { @Override public int getNumValuesMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - return byteBuffer.getInt(); + return _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining(); } private ByteBuffer slice(int docId, ChunkReaderContext context) { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 33c8525a42d2..affbfd0b10ed 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -54,6 +54,8 @@ public class MultiValueFixedByteRawIndexCreatorTest { private static final Random RANDOM = new Random(); + private static final boolean[] EXPLICIT_MV_ENTRY_SIZE_OPTIONS = {true, false}; + @DataProvider(name = "compressionTypes") public Object[][] compressionTypes() { return Arrays.stream(ChunkCompressionType.values()) @@ -78,77 +80,86 @@ public void cleanup() { 
@Test(dataProvider = "compressionTypes") public void testMVInt(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.INT, ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.INT, ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } @Test(dataProvider = "compressionTypes") public void testMVLong(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, 
MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } @Test(dataProvider = "compressionTypes") public void testMVFloat(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // This tests a 
fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } @Test(dataProvider = "compressionTypes") public void testMVDouble(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, 
(reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } public void testMV(DataType dataType, List inputs, ToIntFunction sizeof, IntFunction constructor, - Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion) + Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion, + boolean explicitMVEntrySize) throws IOException { String column = "testCol_" + dataType; int numDocs = inputs.size(); @@ -158,6 +169,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, maxElements, false, writerVersion, 1024 * 1024, 1000); + creator.setExplicitMVEntrySize(explicitMVEntrySize); inputs.forEach(input -> injector.inject(creator, input)); creator.close(); From d654fd9ce97e26692966b3b4783cf62bbfe60bb7 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: 
Wed, 2 Oct 2024 05:24:41 +0800 Subject: [PATCH 002/101] Fixed uncovered code paths exposed via unit test --- .../impl/fwd/CLPForwardIndexCreatorV1.java | 2 +- .../MultiValueFixedByteRawIndexCreator.java | 54 ++++++++++++++----- .../FixedByteChunkMVForwardIndexReader.java | 12 ++++- .../VarByteChunkForwardIndexReaderV4.java | 23 +++++--- ...ultiValueFixedByteRawIndexCreatorTest.java | 7 +-- 5 files changed, 73 insertions(+), 25 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java index c681265ffb9f..e15bb49d0ee5 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java @@ -120,7 +120,7 @@ public CLPForwardIndexCreatorV1(File baseIndexDir, String column, int numDocs, C _encodedVarsFwdIndexFile = new File(_intermediateFilesDir, column + "_clp_encodedvars.fwd"); _encodedVarsFwdIndexWriter = new MultiValueFixedByteRawIndexCreator(_encodedVarsFwdIndexFile, ChunkCompressionType.LZ4, numDocs, - FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, + FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, true, VarByteChunkForwardIndexWriterV4.VERSION); _clpStats.clear(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index c7a4dafca9a5..46dd09852bc0 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -41,6 +41,29 @@ public class MultiValueFixedByteRawIndexCreator implements ForwardIndexCreator { private final DataType _valueType; private boolean _explicitMVEntrySize = true; + /** + * Create a var-byte raw index creator for the given column + * + * @param baseIndexDir Index directory + * @param compressionType Type of compression to use + * @param column Name of column to index + * @param totalDocs Total number of documents to index + * @param valueType Type of the values + * @param deriveNumDocsPerChunk true if writer should auto-derive the number of rows per chunk + * @param explicitMVEntrySize true if writer should use explicit integer entry size + * @param writerVersion writer format version + * @param targetMaxChunkSizeBytes target max chunk size in bytes, applicable only for V4 or when + * deriveNumDocsPerChunk is true + */ + public MultiValueFixedByteRawIndexCreator(File baseIndexDir, ChunkCompressionType compressionType, String column, + int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, + boolean explicitMVEntrySize, int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) + throws IOException { + this(new File(baseIndexDir, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION), compressionType, totalDocs, + valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, explicitMVEntrySize, writerVersion, targetMaxChunkSizeBytes, + targetDocsPerChunk); + } + /** * Create a var-byte raw index creator for the given column * @@ -59,25 +82,28 @@ public MultiValueFixedByteRawIndexCreator(File baseIndexDir, ChunkCompressionTyp int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) throws IOException { this(new File(baseIndexDir, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION), compressionType, totalDocs, - valueType, 
maxNumberOfMultiValueElements, deriveNumDocsPerChunk, writerVersion, targetMaxChunkSizeBytes, + valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, true, writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); } public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion) + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, boolean explicitMVEntrySize, + int writerVersion) throws IOException { this(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, - writerVersion, ForwardIndexConfig.DEFAULT_TARGET_MAX_CHUNK_SIZE_BYTES, + explicitMVEntrySize, writerVersion, ForwardIndexConfig.DEFAULT_TARGET_MAX_CHUNK_SIZE_BYTES, ForwardIndexConfig.DEFAULT_TARGET_DOCS_PER_CHUNK); } - public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, - int targetMaxChunkSizeBytes, int targetDocsPerChunk) + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, boolean explicitMVEntrySize, + int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) throws IOException { - // Store the length followed by the values - int totalMaxLength = Integer.BYTES + (maxNumberOfMultiValueElements * valueType.getStoredType().size()); + // Store the length followed by the value if explicit MV entry size is enabled + _explicitMVEntrySize = explicitMVEntrySize; + int totalMaxLength = + _explicitMVEntrySize ? 
Integer.BYTES + (maxNumberOfMultiValueElements * valueType.getStoredType().size()) + : (maxNumberOfMultiValueElements * valueType.getStoredType().size()); if (writerVersion < VarByteChunkForwardIndexWriterV4.VERSION) { int numDocsPerChunk = deriveNumDocsPerChunk ? Math.max(targetMaxChunkSizeBytes / (totalMaxLength + VarByteChunkForwardIndexWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE), 1) : targetDocsPerChunk; @@ -117,7 +143,8 @@ public DataType getValueType() { @Override public void putIntMV(int[] values) { - byte[] bytes = new byte[Integer.BYTES + values.length * Integer.BYTES]; + int lengthSize = _explicitMVEntrySize ? Integer.BYTES : 0; + byte[] bytes = new byte[lengthSize + values.length * Integer.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); if (_explicitMVEntrySize) { //write the length @@ -132,7 +159,8 @@ public void putIntMV(int[] values) { @Override public void putLongMV(long[] values) { - byte[] bytes = new byte[Integer.BYTES + values.length * Long.BYTES]; + int lengthSize = _explicitMVEntrySize ? Integer.BYTES : 0; + byte[] bytes = new byte[lengthSize + values.length * Long.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); if (_explicitMVEntrySize) { //write the length @@ -147,7 +175,8 @@ public void putLongMV(long[] values) { @Override public void putFloatMV(float[] values) { - byte[] bytes = new byte[Integer.BYTES + values.length * Float.BYTES]; + int lengthSize = _explicitMVEntrySize ? Integer.BYTES : 0; + byte[] bytes = new byte[lengthSize + values.length * Float.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); if (_explicitMVEntrySize) { //write the length @@ -162,7 +191,8 @@ public void putFloatMV(float[] values) { @Override public void putDoubleMV(double[] values) { - byte[] bytes = new byte[Integer.BYTES + values.length * Double.BYTES]; + int lengthSize = _explicitMVEntrySize ? 
Integer.BYTES : 0; + byte[] bytes = new byte[lengthSize + values.length * Double.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); if (_explicitMVEntrySize) { //write the length diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java index 2682c2392edf..af0e2c954b77 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java @@ -42,6 +42,11 @@ public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType s _maxChunkSize = _numDocsPerChunk * (ROW_OFFSET_SIZE + _lengthOfLongestEntry); } + public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType storedType, boolean explicitMVEntrySize) { + this(dataBuffer, storedType); + _explicitMVEntrySize = explicitMVEntrySize; + } + @Nullable @Override public ChunkReaderContext createContext() { @@ -139,7 +144,12 @@ public double[] getDoubleMV(int docId, ChunkReaderContext context) { @Override public int getNumValuesMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - return _explicitMVEntrySize ? 
byteBuffer.getInt() : byteBuffer.remaining(); + if (false == _explicitMVEntrySize) { + throw new IllegalArgumentException( + "Does not support retrieving num values MV doc without specifying type when explicit MV entry size is not " + + "enabled"); + } + return byteBuffer.getInt(); } private ByteBuffer slice(int docId, ChunkReaderContext context) { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java index a7fadab8c356..7d0da6984872 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java @@ -59,6 +59,7 @@ public class VarByteChunkForwardIndexReaderV4 private final PinotDataBuffer _chunks; private final boolean _isSingleValue; private final long _chunksStartOffset; + private boolean _explicitMVEntrySize = true; public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, boolean isSingleValue) { @@ -76,6 +77,12 @@ public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.Da _isSingleValue = isSingleValue; } + public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, + boolean isSingleValue, boolean explicitMVEntrySize) { + this(dataBuffer, storedType, isSingleValue); + _explicitMVEntrySize = explicitMVEntrySize; + } + @Override public boolean isDictionaryEncoded() { return false; @@ -124,7 +131,7 @@ public byte[] getBytes(int docId, ReaderContext context) { @Override public int getIntMV(int docId, int[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = 
ByteBuffer.wrap(context.getValue(docId)); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Integer.BYTES; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); } @@ -134,7 +141,7 @@ public int getIntMV(int docId, int[] valueBuffer, VarByteChunkForwardIndexReader @Override public int[] getIntMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt(): byteBuffer.remaining() / Integer.BYTES; int[] valueBuffer = new int[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); @@ -145,7 +152,7 @@ public int[] getIntMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext @Override public int getLongMV(int docId, long[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Long.BYTES; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getLong(); } @@ -155,7 +162,7 @@ public int getLongMV(int docId, long[] valueBuffer, VarByteChunkForwardIndexRead @Override public long[] getLongMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? 
byteBuffer.getInt() : byteBuffer.remaining() / Long.BYTES; long[] valueBuffer = new long[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getLong(); @@ -166,7 +173,7 @@ public long[] getLongMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContex @Override public int getFloatMV(int docId, float[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Float.BYTES; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); } @@ -176,7 +183,7 @@ public int getFloatMV(int docId, float[] valueBuffer, VarByteChunkForwardIndexRe @Override public float[] getFloatMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Float.BYTES; float[] valueBuffer = new float[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); @@ -187,7 +194,7 @@ public float[] getFloatMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderCont @Override public int getDoubleMV(int docId, double[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? 
byteBuffer.getInt() : byteBuffer.remaining() / Double.BYTES; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getDouble(); } @@ -197,7 +204,7 @@ public int getDoubleMV(int docId, double[] valueBuffer, VarByteChunkForwardIndex @Override public double[] getDoubleMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = byteBuffer.getInt(); + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Double.BYTES; double[] valueBuffer = new double[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index affbfd0b10ed..2a5567efa464 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.IOException; +import java.nio.BufferUnderflowException; import java.nio.ByteOrder; import java.util.ArrayList; import java.util.Arrays; @@ -168,8 +169,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo file.delete(); MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, - maxElements, false, writerVersion, 1024 * 1024, 1000); - creator.setExplicitMVEntrySize(explicitMVEntrySize); + maxElements, false, explicitMVEntrySize, writerVersion, 1024 * 1024, 1000); inputs.forEach(input -> injector.inject(creator, input)); creator.close(); @@ -177,7 +177,8 
@@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo final PinotDataBuffer buffer = PinotDataBuffer.mapFile(file, true, 0, file.length(), ByteOrder.BIG_ENDIAN, ""); ForwardIndexReader reader = writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + dataType.getStoredType(), false, explicitMVEntrySize) + : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType(), explicitMVEntrySize); final ForwardIndexReaderContext context = reader.createContext(); T valueBuffer = constructor.apply(maxElements); From 3d4b99b4381bf5a11d35b333ac916dba3e2d5c31 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 05:31:50 +0800 Subject: [PATCH 003/101] Fix style issue --- .../creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java | 4 ++-- .../readers/forward/FixedByteChunkMVForwardIndexReader.java | 5 +++-- .../readers/forward/VarByteChunkForwardIndexReaderV4.java | 2 +- .../creator/MultiValueFixedByteRawIndexCreatorTest.java | 1 - 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index 46dd09852bc0..d81b5475c498 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -60,8 +60,8 @@ public MultiValueFixedByteRawIndexCreator(File baseIndexDir, ChunkCompressionTyp boolean explicitMVEntrySize, int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) throws IOException { this(new 
File(baseIndexDir, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION), compressionType, totalDocs, - valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, explicitMVEntrySize, writerVersion, targetMaxChunkSizeBytes, - targetDocsPerChunk); + valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, explicitMVEntrySize, writerVersion, + targetMaxChunkSizeBytes, targetDocsPerChunk); } /** diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java index af0e2c954b77..a2916742a3e0 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java @@ -42,7 +42,8 @@ public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType s _maxChunkSize = _numDocsPerChunk * (ROW_OFFSET_SIZE + _lengthOfLongestEntry); } - public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType storedType, boolean explicitMVEntrySize) { + public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType storedType, + boolean explicitMVEntrySize) { this(dataBuffer, storedType); _explicitMVEntrySize = explicitMVEntrySize; } @@ -144,7 +145,7 @@ public double[] getDoubleMV(int docId, ChunkReaderContext context) { @Override public int getNumValuesMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - if (false == _explicitMVEntrySize) { + if (!_explicitMVEntrySize) { throw new IllegalArgumentException( "Does not support retrieving num values MV doc without specifying type when explicit MV entry size is not " + "enabled"); diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java index 7d0da6984872..50eda77804bf 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java @@ -141,7 +141,7 @@ public int getIntMV(int docId, int[] valueBuffer, VarByteChunkForwardIndexReader @Override public int[] getIntMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt(): byteBuffer.remaining() / Integer.BYTES; + int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Integer.BYTES; int[] valueBuffer = new int[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 2a5567efa464..b32281f9302c 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -20,7 +20,6 @@ import java.io.File; import java.io.IOException; -import java.nio.BufferUnderflowException; import java.nio.ByteOrder; import java.util.ArrayList; import java.util.Arrays; From 
8c967b53fb8fd66a8484115eee3bcde695c60a54 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:21:34 +0800 Subject: [PATCH 004/101] Refactored code to use new class versions. --- .../impl/fwd/CLPForwardIndexCreatorV1.java | 2 +- .../MultiValueFixedByteRawIndexCreator.java | 92 ++++--------- .../MultiValueFixedByteRawIndexCreatorV2.java | 116 ++++++++++++++++ .../FixedByteChunkMVForwardIndexReader.java | 36 ++--- .../FixedByteChunkMVForwardIndexReaderV2.java | 45 +++++++ .../VarByteChunkForwardIndexReaderV4.java | 24 ++-- .../VarByteChunkForwardIndexReaderV5.java | 47 +++++++ ...ultiValueFixedByteRawIndexCreatorTest.java | 124 ++++++++---------- ...tiValueFixedByteRawIndexCreatorV2Test.java | 52 ++++++++ 9 files changed, 369 insertions(+), 169 deletions(-) create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java create mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java index e15bb49d0ee5..c681265ffb9f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java @@ -120,7 +120,7 @@ public CLPForwardIndexCreatorV1(File 
baseIndexDir, String column, int numDocs, C _encodedVarsFwdIndexFile = new File(_intermediateFilesDir, column + "_clp_encodedvars.fwd"); _encodedVarsFwdIndexWriter = new MultiValueFixedByteRawIndexCreator(_encodedVarsFwdIndexFile, ChunkCompressionType.LZ4, numDocs, - FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, true, + FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, VarByteChunkForwardIndexWriterV4.VERSION); _clpStats.clear(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index d81b5475c498..f11125702ba4 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -37,32 +37,8 @@ */ public class MultiValueFixedByteRawIndexCreator implements ForwardIndexCreator { - private final VarByteChunkWriter _indexWriter; + protected final VarByteChunkWriter _indexWriter; private final DataType _valueType; - private boolean _explicitMVEntrySize = true; - - /** - * Create a var-byte raw index creator for the given column - * - * @param baseIndexDir Index directory - * @param compressionType Type of compression to use - * @param column Name of column to index - * @param totalDocs Total number of documents to index - * @param valueType Type of the values - * @param deriveNumDocsPerChunk true if writer should auto-derive the number of rows per chunk - * @param explicitMVEntrySize true if writer should use explicit integer entry size - * @param writerVersion writer format version - * @param targetMaxChunkSizeBytes target max chunk size in bytes, applicable only for V4 or when - * 
deriveNumDocsPerChunk is true - */ - public MultiValueFixedByteRawIndexCreator(File baseIndexDir, ChunkCompressionType compressionType, String column, - int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, - boolean explicitMVEntrySize, int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) - throws IOException { - this(new File(baseIndexDir, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION), compressionType, totalDocs, - valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, explicitMVEntrySize, writerVersion, - targetMaxChunkSizeBytes, targetDocsPerChunk); - } /** * Create a var-byte raw index creator for the given column @@ -82,28 +58,24 @@ public MultiValueFixedByteRawIndexCreator(File baseIndexDir, ChunkCompressionTyp int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) throws IOException { this(new File(baseIndexDir, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION), compressionType, totalDocs, - valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, true, writerVersion, targetMaxChunkSizeBytes, + valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); } public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, boolean explicitMVEntrySize, - int writerVersion) + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion) throws IOException { this(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, - explicitMVEntrySize, writerVersion, ForwardIndexConfig.DEFAULT_TARGET_MAX_CHUNK_SIZE_BYTES, + writerVersion, ForwardIndexConfig.DEFAULT_TARGET_MAX_CHUNK_SIZE_BYTES, ForwardIndexConfig.DEFAULT_TARGET_DOCS_PER_CHUNK); } + public 
MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, boolean explicitMVEntrySize, - int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, + int targetMaxChunkSizeBytes, int targetDocsPerChunk) throws IOException { - // Store the length followed by the value if explicit MV entry size is enabled - _explicitMVEntrySize = explicitMVEntrySize; - int totalMaxLength = - _explicitMVEntrySize ? Integer.BYTES + (maxNumberOfMultiValueElements * valueType.getStoredType().size()) - : (maxNumberOfMultiValueElements * valueType.getStoredType().size()); + int totalMaxLength = computeTotalMaxLength(maxNumberOfMultiValueElements, valueType); if (writerVersion < VarByteChunkForwardIndexWriterV4.VERSION) { int numDocsPerChunk = deriveNumDocsPerChunk ? 
Math.max(targetMaxChunkSizeBytes / (totalMaxLength + VarByteChunkForwardIndexWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE), 1) : targetDocsPerChunk; @@ -118,6 +90,10 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType c _valueType = valueType; } + protected int computeTotalMaxLength(int maxNumberOfMultiValueElements, DataType valueType) { + return Integer.BYTES + (maxNumberOfMultiValueElements * valueType.getStoredType().size()); + } + @Override public boolean isDictionaryEncoded() { return false; @@ -128,14 +104,6 @@ public boolean isSingleValue() { return false; } - public boolean isExplicitMVEntrySize() { - return _explicitMVEntrySize; - } - - public void setExplicitMVEntrySize(boolean explicitMVEntrySize) { - _explicitMVEntrySize = explicitMVEntrySize; - } - @Override public DataType getValueType() { return _valueType; @@ -143,13 +111,10 @@ public DataType getValueType() { @Override public void putIntMV(int[] values) { - int lengthSize = _explicitMVEntrySize ? Integer.BYTES : 0; - byte[] bytes = new byte[lengthSize + values.length * Integer.BYTES]; + byte[] bytes = new byte[Integer.BYTES + values.length * Integer.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - if (_explicitMVEntrySize) { - //write the length - byteBuffer.putInt(values.length); - } + //write the length + byteBuffer.putInt(values.length); //write the content of each element for (int value : values) { byteBuffer.putInt(value); @@ -159,13 +124,10 @@ public void putIntMV(int[] values) { @Override public void putLongMV(long[] values) { - int lengthSize = _explicitMVEntrySize ? 
Integer.BYTES : 0; - byte[] bytes = new byte[lengthSize + values.length * Long.BYTES]; + byte[] bytes = new byte[Integer.BYTES + values.length * Long.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - if (_explicitMVEntrySize) { - //write the length - byteBuffer.putInt(values.length); - } + //write the length + byteBuffer.putInt(values.length); //write the content of each element for (long value : values) { byteBuffer.putLong(value); @@ -175,13 +137,10 @@ public void putLongMV(long[] values) { @Override public void putFloatMV(float[] values) { - int lengthSize = _explicitMVEntrySize ? Integer.BYTES : 0; - byte[] bytes = new byte[lengthSize + values.length * Float.BYTES]; + byte[] bytes = new byte[Integer.BYTES + values.length * Float.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - if (_explicitMVEntrySize) { - //write the length - byteBuffer.putInt(values.length); - } + //write the length + byteBuffer.putInt(values.length); //write the content of each element for (float value : values) { byteBuffer.putFloat(value); @@ -191,13 +150,10 @@ public void putFloatMV(float[] values) { @Override public void putDoubleMV(double[] values) { - int lengthSize = _explicitMVEntrySize ? 
Integer.BYTES : 0; - byte[] bytes = new byte[lengthSize + values.length * Double.BYTES]; + byte[] bytes = new byte[Integer.BYTES + values.length * Double.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - if (_explicitMVEntrySize) { - //write the length - byteBuffer.putInt(values.length); - } + //write the length + byteBuffer.putInt(values.length); //write the content of each element for (double value : values) { byteBuffer.putDouble(value); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java new file mode 100644 index 000000000000..90bc9281d23a --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.creator.impl.fwd; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + Same as MultiValueFixedByteRawIndexCreator, but without storing the number of elements for each row. + */ +public class MultiValueFixedByteRawIndexCreatorV2 extends MultiValueFixedByteRawIndexCreator { + /** + * Create a var-byte raw index creator for the given column + * + * @param baseIndexDir Index directory + * @param compressionType Type of compression to use + * @param column Name of column to index + * @param totalDocs Total number of documents to index + * @param valueType Type of the values + * @param deriveNumDocsPerChunk true if writer should auto-derive the number of rows per chunk + * @param writerVersion writer format version + * @param targetMaxChunkSizeBytes target max chunk size in bytes, applicable only for V4 or when + * deriveNumDocsPerChunk is true + */ + public MultiValueFixedByteRawIndexCreatorV2(File baseIndexDir, ChunkCompressionType compressionType, String column, + int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, + int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) + throws IOException { + super(baseIndexDir, compressionType, column, totalDocs, valueType, maxNumberOfMultiValueElements, + deriveNumDocsPerChunk, writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); + } + + public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion) + throws IOException { + super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, + writerVersion); + } + + public 
MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, + int targetMaxChunkSizeBytes, int targetDocsPerChunk) + throws IOException { + super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, + writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); + } + + @Override + protected int computeTotalMaxLength(int maxNumberOfMultiValueElements, DataType valueType) { + return maxNumberOfMultiValueElements * valueType.getStoredType().size(); + } + + @Override + public void putIntMV(int[] values) { + byte[] bytes = new byte[values.length * Integer.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (int value : values) { + byteBuffer.putInt(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putLongMV(long[] values) { + byte[] bytes = new byte[values.length * Long.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (long value : values) { + byteBuffer.putLong(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putFloatMV(float[] values) { + byte[] bytes = new byte[values.length * Float.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (float value : values) { + byteBuffer.putFloat(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putDoubleMV(double[] values) { + byte[] bytes = new byte[values.length * Double.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (double value : values) { + byteBuffer.putDouble(value); + } + _indexWriter.putBytes(bytes); + } +} diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java index a2916742a3e0..b99eba94ce0f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java @@ -31,23 +31,16 @@ * LONG, FLOAT, DOUBLE). *

For data layout, please refer to the documentation for {@link VarByteChunkForwardIndexWriter} */ -public final class FixedByteChunkMVForwardIndexReader extends BaseChunkForwardIndexReader { +public class FixedByteChunkMVForwardIndexReader extends BaseChunkForwardIndexReader { private static final int ROW_OFFSET_SIZE = VarByteChunkForwardIndexWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE; private final int _maxChunkSize; - private boolean _explicitMVEntrySize = true; public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType storedType) { super(dataBuffer, storedType, false); _maxChunkSize = _numDocsPerChunk * (ROW_OFFSET_SIZE + _lengthOfLongestEntry); } - public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType storedType, - boolean explicitMVEntrySize) { - this(dataBuffer, storedType); - _explicitMVEntrySize = explicitMVEntrySize; - } - @Nullable @Override public ChunkReaderContext createContext() { @@ -61,7 +54,7 @@ public ChunkReaderContext createContext() { @Override public int getIntMV(int docId, int[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Integer.BYTES; + int numValues = getNumValuesMV(byteBuffer); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); } @@ -71,7 +64,7 @@ public int getIntMV(int docId, int[] valueBuffer, ChunkReaderContext context) { @Override public int[] getIntMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = _explicitMVEntrySize ? 
byteBuffer.getInt() : byteBuffer.remaining() / Integer.BYTES; + int numValues = getNumValuesMV(byteBuffer); int[] valueBuffer = new int[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); @@ -82,7 +75,7 @@ public int[] getIntMV(int docId, ChunkReaderContext context) { @Override public int getLongMV(int docId, long[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Long.BYTES; + int numValues = getNumValuesMV(byteBuffer); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getLong(); } @@ -92,7 +85,7 @@ public int getLongMV(int docId, long[] valueBuffer, ChunkReaderContext context) @Override public long[] getLongMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Long.BYTES; + int numValues = getNumValuesMV(byteBuffer); long[] valueBuffer = new long[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getLong(); @@ -103,7 +96,7 @@ public long[] getLongMV(int docId, ChunkReaderContext context) { @Override public int getFloatMV(int docId, float[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Float.BYTES; + int numValues = getNumValuesMV(byteBuffer); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); } @@ -113,7 +106,7 @@ public int getFloatMV(int docId, float[] valueBuffer, ChunkReaderContext context @Override public float[] getFloatMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = _explicitMVEntrySize ? 
byteBuffer.getInt() : byteBuffer.remaining() / Float.BYTES; + int numValues = getNumValuesMV(byteBuffer); float[] valueBuffer = new float[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); @@ -124,7 +117,7 @@ public float[] getFloatMV(int docId, ChunkReaderContext context) { @Override public int getDoubleMV(int docId, double[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Double.BYTES; + int numValues = getNumValuesMV(byteBuffer); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getDouble(); } @@ -134,7 +127,7 @@ public int getDoubleMV(int docId, double[] valueBuffer, ChunkReaderContext conte @Override public double[] getDoubleMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Double.BYTES; + int numValues = getNumValuesMV(byteBuffer); double[] valueBuffer = new double[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getDouble(); @@ -145,15 +138,14 @@ public double[] getDoubleMV(int docId, ChunkReaderContext context) { @Override public int getNumValuesMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - if (!_explicitMVEntrySize) { - throw new IllegalArgumentException( - "Does not support retrieving num values MV doc without specifying type when explicit MV entry size is not " - + "enabled"); - } return byteBuffer.getInt(); } - private ByteBuffer slice(int docId, ChunkReaderContext context) { + protected int getNumValuesMV(ByteBuffer byteBuffer) { + return byteBuffer.getInt(); + } + + protected ByteBuffer slice(int docId, ChunkReaderContext context) { if (_isCompressed) { return sliceBytesCompressed(docId, context); } else { diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java new file mode 100644 index 000000000000..762672928d72 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.index.readers.forward; + +import java.nio.ByteBuffer; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + Same as FixedByteChunkMVForwardIndexReader, but the number of elements for each row is inferred + */ +public final class FixedByteChunkMVForwardIndexReaderV2 extends FixedByteChunkMVForwardIndexReader { + + public FixedByteChunkMVForwardIndexReaderV2(PinotDataBuffer dataBuffer, DataType storedType) { + super(dataBuffer, storedType); + } + + @Override + public int getNumValuesMV(int docId, ChunkReaderContext context) { + ByteBuffer byteBuffer = slice(docId, context); + return getNumValuesMV(byteBuffer); + } + + @Override + protected int getNumValuesMV(ByteBuffer byteBuffer) { + return byteBuffer.remaining() / _storedType.size(); + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java index 50eda77804bf..981deeeb793a 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java @@ -50,7 +50,7 @@ public class VarByteChunkForwardIndexReaderV4 private static final Logger LOGGER = LoggerFactory.getLogger(VarByteChunkForwardIndexReaderV4.class); private static final int METADATA_ENTRY_SIZE = 8; - private final FieldSpec.DataType _storedType; + protected final FieldSpec.DataType _storedType; private final int _targetDecompressedChunkSize; private final ChunkDecompressor _chunkDecompressor; private final ChunkCompressionType _chunkCompressionType; @@ -59,7 +59,6 @@ public class 
VarByteChunkForwardIndexReaderV4 private final PinotDataBuffer _chunks; private final boolean _isSingleValue; private final long _chunksStartOffset; - private boolean _explicitMVEntrySize = true; public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, boolean isSingleValue) { @@ -80,7 +79,6 @@ public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.Da public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, boolean isSingleValue, boolean explicitMVEntrySize) { this(dataBuffer, storedType, isSingleValue); - _explicitMVEntrySize = explicitMVEntrySize; } @Override @@ -128,10 +126,14 @@ public byte[] getBytes(int docId, ReaderContext context) { return context.getValue(docId); } + protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { + return byteBuffer.getInt(); + } + @Override public int getIntMV(int docId, int[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Integer.BYTES; + int numValues = getNumFixedByteValuesMV(byteBuffer); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); } @@ -141,7 +143,7 @@ public int getIntMV(int docId, int[] valueBuffer, VarByteChunkForwardIndexReader @Override public int[] getIntMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = _explicitMVEntrySize ? 
byteBuffer.getInt() : byteBuffer.remaining() / Integer.BYTES; + int numValues = getNumFixedByteValuesMV(byteBuffer); int[] valueBuffer = new int[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); @@ -152,7 +154,7 @@ public int[] getIntMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext @Override public int getLongMV(int docId, long[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Long.BYTES; + int numValues = getNumFixedByteValuesMV(byteBuffer); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getLong(); } @@ -162,7 +164,7 @@ public int getLongMV(int docId, long[] valueBuffer, VarByteChunkForwardIndexRead @Override public long[] getLongMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Long.BYTES; + int numValues = getNumFixedByteValuesMV(byteBuffer); long[] valueBuffer = new long[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getLong(); @@ -173,7 +175,7 @@ public long[] getLongMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContex @Override public int getFloatMV(int docId, float[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = _explicitMVEntrySize ? 
byteBuffer.getInt() : byteBuffer.remaining() / Float.BYTES; + int numValues = getNumFixedByteValuesMV(byteBuffer); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); } @@ -183,7 +185,7 @@ public int getFloatMV(int docId, float[] valueBuffer, VarByteChunkForwardIndexRe @Override public float[] getFloatMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Float.BYTES; + int numValues = getNumFixedByteValuesMV(byteBuffer); float[] valueBuffer = new float[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); @@ -194,7 +196,7 @@ public float[] getFloatMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderCont @Override public int getDoubleMV(int docId, double[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = _explicitMVEntrySize ? byteBuffer.getInt() : byteBuffer.remaining() / Double.BYTES; + int numValues = getNumFixedByteValuesMV(byteBuffer); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getDouble(); } @@ -204,7 +206,7 @@ public int getDoubleMV(int docId, double[] valueBuffer, VarByteChunkForwardIndex @Override public double[] getDoubleMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { ByteBuffer byteBuffer = ByteBuffer.wrap(context.getValue(docId)); - int numValues = _explicitMVEntrySize ? 
byteBuffer.getInt() : byteBuffer.remaining() / Double.BYTES; + int numValues = getNumFixedByteValuesMV(byteBuffer); double[] valueBuffer = new double[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java new file mode 100644 index 000000000000..0a8a4527bebc --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.index.readers.forward; + +import java.nio.ByteBuffer; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec; + + +/** + * Chunk-based raw (non-dictionary-encoded) forward index reader for values of SV variable length data types + * (BIG_DECIMAL, STRING, BYTES), MV fixed length and MV variable length data types. + *

For data layout, please refer to the documentation for {@link VarByteChunkForwardIndexWriterV4} + */ +public class VarByteChunkForwardIndexReaderV5 extends VarByteChunkForwardIndexReaderV4 { + public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, + boolean isSingleValue) { + super(dataBuffer, storedType, isSingleValue); + } + + public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, + boolean isSingleValue, boolean explicitMVEntrySize) { + super(dataBuffer, storedType, isSingleValue, explicitMVEntrySize); + } + + @Override + protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { + return byteBuffer.remaining() / _storedType.size(); + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index b32281f9302c..c17c77a8a874 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -54,8 +54,6 @@ public class MultiValueFixedByteRawIndexCreatorTest { private static final Random RANDOM = new Random(); - private static final boolean[] EXPLICIT_MV_ENTRY_SIZE_OPTIONS = {true, false}; - @DataProvider(name = "compressionTypes") public Object[][] compressionTypes() { return Arrays.stream(ChunkCompressionType.values()) @@ -80,86 +78,78 @@ public void cleanup() { @Test(dataProvider = "compressionTypes") public void testMVInt(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.INT, 
ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.INT, ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } @Test(dataProvider = "compressionTypes") public void testMVLong(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, 
explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } @Test(dataProvider = "compressionTypes") public void testMVFloat(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, 
MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } @Test(dataProvider = "compressionTypes") public void testMVDouble(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, 
writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } public void testMV(DataType dataType, List inputs, ToIntFunction sizeof, IntFunction constructor, - Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion, - boolean explicitMVEntrySize) + Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion) throws IOException { String column = "testCol_" + dataType; int numDocs = inputs.size(); @@ -168,7 +158,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo file.delete(); MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, - maxElements, false, explicitMVEntrySize, writerVersion, 1024 * 1024, 1000); + maxElements, false, writerVersion, 1024 * 1024, 1000); inputs.forEach(input -> injector.inject(creator, input)); creator.close(); @@ -176,8 +166,8 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo final PinotDataBuffer buffer = PinotDataBuffer.mapFile(file, true, 0, file.length(), ByteOrder.BIG_ENDIAN, ""); ForwardIndexReader reader = writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false, explicitMVEntrySize) - : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType(), explicitMVEntrySize); + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); final ForwardIndexReaderContext context = reader.createContext(); T valueBuffer = constructor.apply(maxElements); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java new file mode 100644 index 000000000000..e4937a12ab3f --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.index.creator; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.function.IntFunction; +import java.util.function.ToIntFunction; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReader; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; +import org.apache.pinot.segment.spi.V1Constants.Indexes; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + + +public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { + // TODO: +} From c2359ec55d7460426611e7a93fd88faa041f7181 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:25:29 +0800 Subject: [PATCH 005/101] Fixed style. 
--- ...tiValueFixedByteRawIndexCreatorV2Test.java | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index e4937a12ab3f..8a3e2e4fc65e 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,35 +18,6 @@ */ package org.apache.pinot.segment.local.segment.index.creator; -import java.io.File; -import java.io.IOException; -import java.nio.ByteOrder; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; -import java.util.function.IntFunction; -import java.util.function.ToIntFunction; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.commons.io.FileUtils; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; -import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReader; -import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; -import org.apache.pinot.segment.spi.V1Constants.Indexes; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec.DataType; -import org.testng.Assert; -import 
org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - - public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { // TODO: } From dd3410f61bcd3b560093e45c62699f528403e7a7 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:32:21 +0800 Subject: [PATCH 006/101] Refactored MultiValueFixedByteRawIndexCreatorTest.java --- ...MultiValueFixedByteRawIndexCreatorTest.java | 11 +++++++---- ...ltiValueFixedByteRawIndexCreatorV2Test.java | 18 +++++++++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index c17c77a8a874..78d72b49e452 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -164,10 +164,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo //read final PinotDataBuffer buffer = PinotDataBuffer.mapFile(file, true, 0, file.length(), ByteOrder.BIG_ENDIAN, ""); - ForwardIndexReader reader = - writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + ForwardIndexReader reader = getForwardIndexReader(buffer, dataType, writerVersion); final ForwardIndexReaderContext context = reader.createContext(); T valueBuffer = constructor.apply(maxElements); @@ -189,6 +186,12 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo } } + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + } + interface Extractor { T extract(ForwardIndexReader reader, ForwardIndexReaderContext context, int offset, T buffer); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index 8a3e2e4fc65e..1c90b8092f6a 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,6 +18,22 @@ */ package org.apache.pinot.segment.local.segment.index.creator; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; +import 
org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec; + + +/** + Same as MultiValueFixedByteRawIndexCreatorTest, but the forward index creator and reader are newer version + */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { - // TODO: + @Override + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV5(buffer, + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); + } } From 0c0df847ad78dd741bc3cdbae136238f56bd9374 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:36:46 +0800 Subject: [PATCH 007/101] Fix style. --- .../creator/MultiValueFixedByteRawIndexCreatorV2Test.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index 1c90b8092f6a..c07257901009 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -31,9 +31,9 @@ */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { @Override - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, + int writerVersion) { return writerVersion 
== VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV5(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); + dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); } } From e7e091b7d8017b576e960a86f3dee8e49721e04e Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 14:25:36 +0800 Subject: [PATCH 008/101] Modified existing unit test and extended it for MultiValueFixedByteRawIndexCreatorV2Test.java --- ...ultiValueFixedByteRawIndexCreatorTest.java | 31 +++++++++++-------- ...tiValueFixedByteRawIndexCreatorV2Test.java | 27 ++++++++++++++-- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 78d72b49e452..8be27bf0fe09 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -49,8 +49,7 @@ public class MultiValueFixedByteRawIndexCreatorTest { - private static final String OUTPUT_DIR = - System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawTest"; + protected static String _outputDir; private static final Random RANDOM = new Random(); @@ -64,7 +63,8 @@ public Object[][] compressionTypes() { @BeforeClass public void setup() throws Exception { - FileUtils.forceMkdir(new File(OUTPUT_DIR)); + _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawTest"; + FileUtils.forceMkdir(new File(_outputDir)); } /** @@ -72,7 +72,7 @@ public void setup() */ @AfterClass public void 
cleanup() { - FileUtils.deleteQuietly(new File(OUTPUT_DIR)); + FileUtils.deleteQuietly(new File(_outputDir)); } @Test(dataProvider = "compressionTypes") @@ -148,17 +148,28 @@ public void testMVDouble(ChunkCompressionType compressionType, int writerVersion }, compressionType, writerVersion); } + public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator(ChunkCompressionType compressionType, + String column, int numDocs, DataType dataType, int maxElements, int writerVersion) + throws IOException { + return new MultiValueFixedByteRawIndexCreator(new File(_outputDir), compressionType, column, numDocs, dataType, + maxElements, false, writerVersion, 1024 * 1024, 1000); + } + + public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, + dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + } + public void testMV(DataType dataType, List inputs, ToIntFunction sizeof, IntFunction constructor, Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion) throws IOException { String column = "testCol_" + dataType; int numDocs = inputs.size(); int maxElements = inputs.stream().mapToInt(sizeof).max().orElseThrow(RuntimeException::new); - File file = new File(OUTPUT_DIR, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION); + File file = new File(_outputDir, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION); file.delete(); MultiValueFixedByteRawIndexCreator creator = - new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, - maxElements, false, writerVersion, 1024 * 1024, 1000); + getMultiValueFixedByteRawIndexCreator(compressionType, column, numDocs, dataType, maxElements, writerVersion); inputs.forEach(input -> injector.inject(creator, input)); 
creator.close(); @@ -186,12 +197,6 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo } } - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { - return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); - } - interface Extractor { T extract(ForwardIndexReader reader, ForwardIndexReaderContext context, int offset, T buffer); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index c07257901009..eae997014e22 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,20 +18,43 @@ */ package org.apache.pinot.segment.local.segment.index.creator; +import java.io.File; +import java.io.IOException; +import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreatorV2; import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; 
import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; +import org.testng.annotations.BeforeClass; /** - Same as MultiValueFixedByteRawIndexCreatorTest, but the forward index creator and reader are newer version + Same as MultiValueFixedByteRawIndexCreatorTest, but newer version of forward index creator and reader are used */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { + @BeforeClass + public void setup() + throws Exception { + _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawV2Test"; + FileUtils.forceMkdir(new File(_outputDir)); + } + + @Override + public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator( + ChunkCompressionType compressionType, String column, int numDocs, FieldSpec.DataType dataType, int maxElements, + int writerVersion) + throws IOException { + return new MultiValueFixedByteRawIndexCreatorV2(new File(_outputDir), compressionType, column, numDocs, dataType, + maxElements, false, writerVersion, 1024 * 1024, 1000); + } + @Override - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, + public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV5(buffer, dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); From 153be163581ae90edbdb11aeb0d477393d5922b9 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 18:15:33 +0800 Subject: [PATCH 009/101] Improved unit test for MultiValueFixedByteRawIndexCreatorTest and MultiValueFixedByteRawIndexCreatorV2Test --- ...ultiValueFixedByteRawIndexCreatorTest.java | 2 +- ...tiValueFixedByteRawIndexCreatorV2Test.java | 63 ++++++++++++++++++- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 8be27bf0fe09..1b4e4e9368a1 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -51,7 +51,7 @@ public class MultiValueFixedByteRawIndexCreatorTest { protected static String _outputDir; - private static final Random RANDOM = new Random(); + protected static final Random RANDOM = new Random(); @DataProvider(name = "compressionTypes") public Object[][] compressionTypes() { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index eae997014e22..dd3d0b1156b8 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -20,6 +20,8 @@ import java.io.File; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; @@ -30,11 +32,15 @@ import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; +import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; /** - Same as MultiValueFixedByteRawIndexCreatorTest, but newer version of forward index creator and reader are used + Similar to MultiValueFixedByteRawIndexCreatorTest, but utilizes the newer version of the forward index creator and + reader. Additionally, this test class includes a validation test for checking the compression ratio improvement with + the forward index creator version upgrade. */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { @BeforeClass @@ -59,4 +65,59 @@ public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpe return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV5(buffer, dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); } + + @Test + public void validateCompressionRatioIncrease() + throws IOException { + // Generate input data containing short MV docs with somewhat repetitive data + int numDocs = 1000000; + int numElements = 0; + int maxMVRowSize = 0; + List inputData = new ArrayList<>(numDocs); + for (int i = 0; i < numDocs; i++) { + long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; + maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); + numElements += mvRow.length; + for (int j = 0; j < mvRow.length; j++, numElements++) { + mvRow[j] = numElements % 10; + } + inputData.add(mvRow); + } + + for (int writerVersion : List.of(2, 4)) { + // Generate MV fixed byte raw fwd index with explicit length + File explicitLengthFwdIndexFile = new File(_outputDir, MultiValueFixedByteRawIndexCreator.class.getSimpleName()); + FileUtils.deleteQuietly(explicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator( + explicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, + true, writerVersion)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // Generate MV fixed byte raw fwd index with implicit length + File implicitLengthFwdIndexFile = + new File(_outputDir, MultiValueFixedByteRawIndexCreatorV2.class.getSimpleName()); + FileUtils.deleteQuietly(implicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreatorV2 creator = new MultiValueFixedByteRawIndexCreatorV2( + implicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, + true, writerVersion)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: + // 1. 
At least 15% larger than the implicit length variant when using Writer Version 2 + // 2. At least 200% larger than the implicit length variant when using Writer Version 4 + long expectedImplicitLengthFwdIndexMaxSize; + if (writerVersion == 2) { + expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); + } else { + expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); + } + Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); + } + } } From 0233905dd8e56f6ea8da4d4b3ccd401580c5ed59 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 23:17:42 +0800 Subject: [PATCH 010/101] Remove redundant blank line --- .../index/creator/MultiValueFixedByteRawIndexCreatorTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 1b4e4e9368a1..9a2105726aa7 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -139,7 +139,6 @@ public void testMVDouble(ChunkCompressionType compressionType, int writerVersion return Arrays.copyOf(buffer, length); }, compressionType, writerVersion); - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { From 69defe13cbdc541a5bcfafaec1525c061371a390 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 23:45:49 
+0800 Subject: [PATCH 011/101] Adjusted comments content --- .../creator/MultiValueFixedByteRawIndexCreatorV2Test.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index dd3d0b1156b8..f7105630c5e3 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -109,8 +109,8 @@ public void validateCompressionRatioIncrease() } // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: - // 1. At least 15% larger than the implicit length variant when using Writer Version 2 - // 2. At least 200% larger than the implicit length variant when using Writer Version 4 + // 1. At least 1.15x larger than the implicit length variant when using Writer Version 2 + // 2. At least 2x larger than the implicit length variant when using Writer Version 4 long expectedImplicitLengthFwdIndexMaxSize; if (writerVersion == 2) { expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); From e1173c08a76a34859b2f6c6de3997190197328a4 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 3 Oct 2024 16:22:53 +0800 Subject: [PATCH 012/101] Removed redundant constructor missed during refactoring. 
--- .../readers/forward/VarByteChunkForwardIndexReaderV4.java | 5 ----- .../readers/forward/VarByteChunkForwardIndexReaderV5.java | 5 ----- 2 files changed, 10 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java index 981deeeb793a..f2b64a83b1de 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java @@ -76,11 +76,6 @@ public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.Da _isSingleValue = isSingleValue; } - public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, - boolean isSingleValue, boolean explicitMVEntrySize) { - this(dataBuffer, storedType, isSingleValue); - } - @Override public boolean isDictionaryEncoded() { return false; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java index 0a8a4527bebc..20569bf7ff4e 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -35,11 +35,6 @@ public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.Da super(dataBuffer, storedType, isSingleValue); } - public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, 
FieldSpec.DataType storedType, - boolean isSingleValue, boolean explicitMVEntrySize) { - super(dataBuffer, storedType, isSingleValue, explicitMVEntrySize); - } - @Override protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { return byteBuffer.remaining() / _storedType.size(); From 34ac7865adb2531b0de119ea9127e415f4e281de Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 05:51:53 +0800 Subject: [PATCH 013/101] Upgrade MVFixedByteRawIndex reader and writer from V4 to V5, retain forward index creator version. --- .../VarByteChunkForwardIndexWriterV4.java | 12 +- .../VarByteChunkForwardIndexWriterV5.java | 55 +++++++++ .../MultiValueFixedByteRawIndexCreator.java | 40 +++--- .../MultiValueFixedByteRawIndexCreatorV2.java | 116 ------------------ .../forward/ForwardIndexReaderFactory.java | 7 ++ .../VarByteChunkForwardIndexReaderV4.java | 8 +- .../VarByteChunkForwardIndexReaderV5.java | 8 ++ 7 files changed, 107 insertions(+), 139 deletions(-) create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java delete mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index 440808a6b0bd..f677964f2aad 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -77,11 +77,13 @@ public class VarByteChunkForwardIndexWriterV4 implements VarByteChunkWriter { public static final int VERSION = 4; - private static final Logger LOGGER = 
LoggerFactory.getLogger(VarByteChunkForwardIndexWriterV4.class); + // Use the run-time concrete class to retrieve the logger + protected final Logger LOGGER = LoggerFactory.getLogger(this.getClass()); + private static final String DATA_BUFFER_SUFFIX = ".buf"; private final File _dataBuffer; - private final RandomAccessFile _output; + protected final RandomAccessFile _output; private final FileChannel _dataChannel; private final ByteBuffer _chunkBuffer; private final ByteBuffer _compressionBuffer; @@ -106,11 +108,15 @@ public VarByteChunkForwardIndexWriterV4(File file, ChunkCompressionType compress writeHeader(_chunkCompressor.compressionType(), chunkSize); } + public int getVersion() { + return VERSION; + } + private void writeHeader(ChunkCompressionType compressionType, int targetDecompressedChunkSize) throws IOException { // keep metadata BE for backwards compatibility // (e.g. the version needs to be read by a factory which assumes BE) - _output.writeInt(VERSION); + _output.writeInt(getVersion()); _output.writeInt(targetDecompressedChunkSize); _output.writeInt(compressionType.getValue()); // reserve a slot to write the data offset into diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java new file mode 100644 index 000000000000..72c94e210139 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.writer.impl; + +import java.io.File; +import java.io.IOException; +import javax.annotation.concurrent.NotThreadSafe; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; + + +/** + * Forward index writer that extends {@link VarByteChunkForwardIndexWriterV4} with the only difference being the + * version tag is now bumped from 4 to 5. + * + *

The {@code VERSION} tag is a {@code static final} class variable set to {@code 5}. Since static fields + * are hidden (not overridden) by a child class and bound to the class that declares them, care must be taken to ensure + * that the parent class can correctly observe the child class's {@code VERSION} value at runtime.

+ * + *

To achieve this, the {@code getVersion()} method is overridden to return the concrete subclass's + * {@code VERSION} value, ensuring that the correct version number is returned even when using a reference + * to the parent class.

+ * + * @see VarByteChunkForwardIndexWriterV4 + * @see VarByteChunkForwardIndexWriterV5#getVersion() + */ +@NotThreadSafe +public class VarByteChunkForwardIndexWriterV5 extends VarByteChunkForwardIndexWriterV4 { + public static final int VERSION = 5; + + public VarByteChunkForwardIndexWriterV5(File file, ChunkCompressionType compressionType, int chunkSize) + throws IOException { + super(file, compressionType, chunkSize); + } + + @Override + public int getVersion() { + return VERSION; + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index f11125702ba4..3870998a8f2f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -23,6 +23,7 @@ import java.nio.ByteBuffer; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriter; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkWriter; import org.apache.pinot.segment.spi.V1Constants.Indexes; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; @@ -39,6 +40,7 @@ public class MultiValueFixedByteRawIndexCreator implements ForwardIndexCreator { protected final VarByteChunkWriter _indexWriter; private final DataType _valueType; + private final boolean _writeExplicitNumValueCount; /** * Create a var-byte raw index creator for the given column @@ -70,11 +72,11 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType c 
ForwardIndexConfig.DEFAULT_TARGET_DOCS_PER_CHUNK); } - public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) throws IOException { + _writeExplicitNumValueCount = writerVersion < VarByteChunkForwardIndexWriterV5.VERSION; int totalMaxLength = computeTotalMaxLength(maxNumberOfMultiValueElements, valueType); if (writerVersion < VarByteChunkForwardIndexWriterV4.VERSION) { int numDocsPerChunk = deriveNumDocsPerChunk ? Math.max(targetMaxChunkSizeBytes / (totalMaxLength @@ -85,7 +87,9 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType c } else { int chunkSize = ForwardIndexUtils.getDynamicTargetChunkSize(totalMaxLength, targetDocsPerChunk, targetMaxChunkSizeBytes); - _indexWriter = new VarByteChunkForwardIndexWriterV4(indexFile, compressionType, chunkSize); + _indexWriter = + (writerVersion == VarByteChunkForwardIndexWriterV4.VERSION) ? new VarByteChunkForwardIndexWriterV4(indexFile, + compressionType, chunkSize) : new VarByteChunkForwardIndexWriterV5(indexFile, compressionType, chunkSize); } _valueType = valueType; } @@ -111,11 +115,11 @@ public DataType getValueType() { @Override public void putIntMV(int[] values) { - byte[] bytes = new byte[Integer.BYTES + values.length * Integer.BYTES]; + byte[] bytes = new byte[(_writeExplicitNumValueCount ? 
Integer.BYTES : 0) + values.length * Integer.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the length - byteBuffer.putInt(values.length); - //write the content of each element + // write the length when necessary + if (_writeExplicitNumValueCount) byteBuffer.putInt(values.length); + // write the content of each element for (int value : values) { byteBuffer.putInt(value); } @@ -124,11 +128,11 @@ public void putIntMV(int[] values) { @Override public void putLongMV(long[] values) { - byte[] bytes = new byte[Integer.BYTES + values.length * Long.BYTES]; + byte[] bytes = new byte[(_writeExplicitNumValueCount ? Integer.BYTES : 0) + values.length * Long.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the length - byteBuffer.putInt(values.length); - //write the content of each element + // write the length when necessary + if (_writeExplicitNumValueCount) byteBuffer.putInt(values.length); + // write the content of each element for (long value : values) { byteBuffer.putLong(value); } @@ -137,11 +141,11 @@ public void putLongMV(long[] values) { @Override public void putFloatMV(float[] values) { - byte[] bytes = new byte[Integer.BYTES + values.length * Float.BYTES]; + byte[] bytes = new byte[(_writeExplicitNumValueCount ? Integer.BYTES : 0) + values.length * Float.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the length - byteBuffer.putInt(values.length); - //write the content of each element + // write the length when necessary + if (_writeExplicitNumValueCount) byteBuffer.putInt(values.length); + // write the content of each element for (float value : values) { byteBuffer.putFloat(value); } @@ -150,11 +154,11 @@ public void putFloatMV(float[] values) { @Override public void putDoubleMV(double[] values) { - byte[] bytes = new byte[Integer.BYTES + values.length * Double.BYTES]; + byte[] bytes = new byte[(_writeExplicitNumValueCount ? 
Integer.BYTES : 0) + values.length * Double.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the length - byteBuffer.putInt(values.length); - //write the content of each element + // write the length when necessary + if (_writeExplicitNumValueCount) byteBuffer.putInt(values.length); + // write the content of each element for (double value : values) { byteBuffer.putDouble(value); } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java deleted file mode 100644 index 90bc9281d23a..000000000000 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.pinot.segment.local.segment.creator.impl.fwd; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.spi.data.FieldSpec.DataType; - - -/** - Same as MultiValueFixedByteRawIndexCreator, but without storing the number of elements for each row. - */ -public class MultiValueFixedByteRawIndexCreatorV2 extends MultiValueFixedByteRawIndexCreator { - /** - * Create a var-byte raw index creator for the given column - * - * @param baseIndexDir Index directory - * @param compressionType Type of compression to use - * @param column Name of column to index - * @param totalDocs Total number of documents to index - * @param valueType Type of the values - * @param deriveNumDocsPerChunk true if writer should auto-derive the number of rows per chunk - * @param writerVersion writer format version - * @param targetMaxChunkSizeBytes target max chunk size in bytes, applicable only for V4 or when - * deriveNumDocsPerChunk is true - */ - public MultiValueFixedByteRawIndexCreatorV2(File baseIndexDir, ChunkCompressionType compressionType, String column, - int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, - int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) - throws IOException { - super(baseIndexDir, compressionType, column, totalDocs, valueType, maxNumberOfMultiValueElements, - deriveNumDocsPerChunk, writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); - } - - public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion) - throws IOException { - super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, - writerVersion); - } - - public 
MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, - int targetMaxChunkSizeBytes, int targetDocsPerChunk) - throws IOException { - super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, - writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); - } - - @Override - protected int computeTotalMaxLength(int maxNumberOfMultiValueElements, DataType valueType) { - return maxNumberOfMultiValueElements * valueType.getStoredType().size(); - } - - @Override - public void putIntMV(int[] values) { - byte[] bytes = new byte[values.length * Integer.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (int value : values) { - byteBuffer.putInt(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putLongMV(long[] values) { - byte[] bytes = new byte[values.length * Long.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (long value : values) { - byteBuffer.putLong(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putFloatMV(float[] values) { - byte[] bytes = new byte[values.length * Float.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (float value : values) { - byteBuffer.putFloat(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putDoubleMV(double[] values) { - byte[] bytes = new byte[values.length * Double.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (double value : values) { - byteBuffer.putDouble(value); - } - _indexWriter.putBytes(bytes); - } -} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java index db815761d9ea..a4344b61f0de 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java @@ -19,8 +19,10 @@ package org.apache.pinot.segment.local.segment.index.forward; +import com.google.errorprone.annotations.Var; import java.util.Arrays; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.creator.impl.fwd.CLPForwardIndexCreatorV1; import org.apache.pinot.segment.local.segment.index.readers.forward.CLPForwardIndexReaderV1; import org.apache.pinot.segment.local.segment.index.readers.forward.FixedBitMVEntryDictForwardIndexReader; @@ -30,6 +32,7 @@ import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkSVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.forward.FixedBytePower2ChunkSVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkMVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkSVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.sorted.SortedIndexReaderImpl; @@ -106,6 +109,10 @@ public static ForwardIndexReader createRawIndexReader(PinotDataBuffer dataBuffer : new FixedByteChunkSVForwardIndexReader(dataBuffer, storedType); } + if (version >= VarByteChunkForwardIndexWriterV5.VERSION) { + // V5 is 
the same as V4 except the multi-value docs have implicit value count rather than explicit + return new VarByteChunkForwardIndexReaderV5(dataBuffer, storedType, isSingleValue); + } if (version == VarByteChunkForwardIndexWriterV4.VERSION) { // V4 reader is common for sv var byte, mv fixed byte and mv var byte return new VarByteChunkForwardIndexReaderV4(dataBuffer, storedType, isSingleValue); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java index f2b64a83b1de..7deba8417bb6 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java @@ -62,8 +62,7 @@ public class VarByteChunkForwardIndexReaderV4 public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, boolean isSingleValue) { - int version = dataBuffer.getInt(0); - Preconditions.checkState(version == VarByteChunkForwardIndexWriterV4.VERSION, "Illegal index version: %s", version); + validateIndexVersion(dataBuffer); _storedType = storedType; _targetDecompressedChunkSize = dataBuffer.getInt(4); _chunkCompressionType = ChunkCompressionType.valueOf(dataBuffer.getInt(8)); @@ -76,6 +75,11 @@ public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.Da _isSingleValue = isSingleValue; } + public void validateIndexVersion(PinotDataBuffer dataBuffer) { + int version = dataBuffer.getInt(0); + Preconditions.checkState(version == VarByteChunkForwardIndexWriterV4.VERSION, "Illegal index version: %s", version); + } + @Override public boolean isDictionaryEncoded() { return false; diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java index 20569bf7ff4e..fd4528c27df7 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -18,8 +18,10 @@ */ package org.apache.pinot.segment.local.segment.index.readers.forward; +import com.google.common.base.Preconditions; import java.nio.ByteBuffer; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; @@ -35,6 +37,12 @@ public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.Da super(dataBuffer, storedType, isSingleValue); } + @Override + public void validateIndexVersion(PinotDataBuffer dataBuffer) { + int version = dataBuffer.getInt(0); + Preconditions.checkState(version == VarByteChunkForwardIndexWriterV5.VERSION, "Illegal index version: %s", version); + } + @Override protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { return byteBuffer.remaining() / _storedType.size(); From b09067657e460687de7e58c26f0630b45fd5418c Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 06:01:48 +0800 Subject: [PATCH 014/101] Minor changes in MultiValueFixedByteRawIndexCreator --- .../impl/fwd/MultiValueFixedByteRawIndexCreator.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index 3870998a8f2f..511e43b6bc1d 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -38,7 +38,7 @@ */ public class MultiValueFixedByteRawIndexCreator implements ForwardIndexCreator { - protected final VarByteChunkWriter _indexWriter; + private final VarByteChunkWriter _indexWriter; private final DataType _valueType; private final boolean _writeExplicitNumValueCount; @@ -77,7 +77,9 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType c int targetMaxChunkSizeBytes, int targetDocsPerChunk) throws IOException { _writeExplicitNumValueCount = writerVersion < VarByteChunkForwardIndexWriterV5.VERSION; - int totalMaxLength = computeTotalMaxLength(maxNumberOfMultiValueElements, valueType); + int totalMaxLength = + (_writeExplicitNumValueCount ? Integer.BYTES : 0) + (maxNumberOfMultiValueElements * valueType.getStoredType() + .size()); if (writerVersion < VarByteChunkForwardIndexWriterV4.VERSION) { int numDocsPerChunk = deriveNumDocsPerChunk ? Math.max(targetMaxChunkSizeBytes / (totalMaxLength + VarByteChunkForwardIndexWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE), 1) : targetDocsPerChunk; @@ -88,16 +90,12 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType c int chunkSize = ForwardIndexUtils.getDynamicTargetChunkSize(totalMaxLength, targetDocsPerChunk, targetMaxChunkSizeBytes); _indexWriter = - (writerVersion == VarByteChunkForwardIndexWriterV4.VERSION) ? 
new VarByteChunkForwardIndexWriterV4(indexFile, + (writerVersion < VarByteChunkForwardIndexWriterV5.VERSION) ? new VarByteChunkForwardIndexWriterV4(indexFile, compressionType, chunkSize) : new VarByteChunkForwardIndexWriterV5(indexFile, compressionType, chunkSize); } _valueType = valueType; } - protected int computeTotalMaxLength(int maxNumberOfMultiValueElements, DataType valueType) { - return Integer.BYTES + (maxNumberOfMultiValueElements * valueType.getStoredType().size()); - } - @Override public boolean isDictionaryEncoded() { return false; From 2ff1914a8d1668c6f76e3355702fd501895496fb Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 06:08:41 +0800 Subject: [PATCH 015/101] Fix minor style issue. --- .../impl/VarByteChunkForwardIndexWriterV4.java | 4 ++-- .../fwd/MultiValueFixedByteRawIndexCreator.java | 16 ++++++++++++---- .../index/forward/ForwardIndexReaderFactory.java | 1 - 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index f677964f2aad..fe1c21d9f9ab 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -78,7 +78,7 @@ public class VarByteChunkForwardIndexWriterV4 implements VarByteChunkWriter { public static final int VERSION = 4; // Use the run-time concrete class to retrieve the logger - protected final Logger LOGGER = LoggerFactory.getLogger(this.getClass()); + protected final Logger _logger = LoggerFactory.getLogger(this.getClass()); private static final String DATA_BUFFER_SUFFIX = ".buf"; @@ -293,7 +293,7 @@ private void write(ByteBuffer buffer, boolean huge) { _chunkOffset += 
compressedSize; _docIdOffset = _nextDocId; } catch (IOException e) { - LOGGER.error("Exception caught while compressing/writing data chunk", e); + _logger.error("Exception caught while compressing/writing data chunk", e); throw new RuntimeException(e); } finally { if (mapped != null) { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index 511e43b6bc1d..e9669e116c93 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -116,7 +116,9 @@ public void putIntMV(int[] values) { byte[] bytes = new byte[(_writeExplicitNumValueCount ? Integer.BYTES : 0) + values.length * Integer.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); // write the length when necessary - if (_writeExplicitNumValueCount) byteBuffer.putInt(values.length); + if (_writeExplicitNumValueCount) { + byteBuffer.putInt(values.length); + } // write the content of each element for (int value : values) { byteBuffer.putInt(value); @@ -129,7 +131,9 @@ public void putLongMV(long[] values) { byte[] bytes = new byte[(_writeExplicitNumValueCount ? Integer.BYTES : 0) + values.length * Long.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); // write the length when necessary - if (_writeExplicitNumValueCount) byteBuffer.putInt(values.length); + if (_writeExplicitNumValueCount) { + byteBuffer.putInt(values.length); + } // write the content of each element for (long value : values) { byteBuffer.putLong(value); @@ -142,7 +146,9 @@ public void putFloatMV(float[] values) { byte[] bytes = new byte[(_writeExplicitNumValueCount ? 
Integer.BYTES : 0) + values.length * Float.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); // write the length when necessary - if (_writeExplicitNumValueCount) byteBuffer.putInt(values.length); + if (_writeExplicitNumValueCount) { + byteBuffer.putInt(values.length); + } // write the content of each element for (float value : values) { byteBuffer.putFloat(value); @@ -155,7 +161,9 @@ public void putDoubleMV(double[] values) { byte[] bytes = new byte[(_writeExplicitNumValueCount ? Integer.BYTES : 0) + values.length * Double.BYTES]; ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); // write the length when necessary - if (_writeExplicitNumValueCount) byteBuffer.putInt(values.length); + if (_writeExplicitNumValueCount) { + byteBuffer.putInt(values.length); + } // write the content of each element for (double value : values) { byteBuffer.putDouble(value); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java index a4344b61f0de..fdf1cfd1b96a 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java @@ -19,7 +19,6 @@ package org.apache.pinot.segment.local.segment.index.forward; -import com.google.errorprone.annotations.Var; import java.util.Arrays; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; From 54b2709ac457a5cdf2e83cc9bb3460333509f22c Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 06:14:35 +0800 Subject: [PATCH 016/101] Refactored FixByteChunkMVForwardIndexReader --- .../FixedByteChunkMVForwardIndexReader.java | 24 ++++++++----------- 
1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java index b99eba94ce0f..f96ed6e878a6 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java @@ -31,7 +31,7 @@ * LONG, FLOAT, DOUBLE). *

For data layout, please refer to the documentation for {@link VarByteChunkForwardIndexWriter} */ -public class FixedByteChunkMVForwardIndexReader extends BaseChunkForwardIndexReader { +public final class FixedByteChunkMVForwardIndexReader extends BaseChunkForwardIndexReader { private static final int ROW_OFFSET_SIZE = VarByteChunkForwardIndexWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE; private final int _maxChunkSize; @@ -54,7 +54,7 @@ public ChunkReaderContext createContext() { @Override public int getIntMV(int docId, int[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = getNumValuesMV(byteBuffer); + int numValues = byteBuffer.getInt(); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); } @@ -64,7 +64,7 @@ public int getIntMV(int docId, int[] valueBuffer, ChunkReaderContext context) { @Override public int[] getIntMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = getNumValuesMV(byteBuffer); + int numValues = byteBuffer.getInt(); int[] valueBuffer = new int[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getInt(); @@ -75,7 +75,7 @@ public int[] getIntMV(int docId, ChunkReaderContext context) { @Override public int getLongMV(int docId, long[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = getNumValuesMV(byteBuffer); + int numValues = byteBuffer.getInt(); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getLong(); } @@ -85,7 +85,7 @@ public int getLongMV(int docId, long[] valueBuffer, ChunkReaderContext context) @Override public long[] getLongMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = getNumValuesMV(byteBuffer); + int numValues = byteBuffer.getInt(); long[] valueBuffer = new long[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = 
byteBuffer.getLong(); @@ -96,7 +96,7 @@ public long[] getLongMV(int docId, ChunkReaderContext context) { @Override public int getFloatMV(int docId, float[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = getNumValuesMV(byteBuffer); + int numValues = byteBuffer.getInt(); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); } @@ -106,7 +106,7 @@ public int getFloatMV(int docId, float[] valueBuffer, ChunkReaderContext context @Override public float[] getFloatMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = getNumValuesMV(byteBuffer); + int numValues = byteBuffer.getInt(); float[] valueBuffer = new float[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getFloat(); @@ -117,7 +117,7 @@ public float[] getFloatMV(int docId, ChunkReaderContext context) { @Override public int getDoubleMV(int docId, double[] valueBuffer, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = getNumValuesMV(byteBuffer); + int numValues = byteBuffer.getInt(); for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getDouble(); } @@ -127,7 +127,7 @@ public int getDoubleMV(int docId, double[] valueBuffer, ChunkReaderContext conte @Override public double[] getDoubleMV(int docId, ChunkReaderContext context) { ByteBuffer byteBuffer = slice(docId, context); - int numValues = getNumValuesMV(byteBuffer); + int numValues = byteBuffer.getInt(); double[] valueBuffer = new double[numValues]; for (int i = 0; i < numValues; i++) { valueBuffer[i] = byteBuffer.getDouble(); @@ -141,11 +141,7 @@ public int getNumValuesMV(int docId, ChunkReaderContext context) { return byteBuffer.getInt(); } - protected int getNumValuesMV(ByteBuffer byteBuffer) { - return byteBuffer.getInt(); - } - - protected ByteBuffer slice(int docId, ChunkReaderContext context) { + private ByteBuffer slice(int docId, 
ChunkReaderContext context) { if (_isCompressed) { return sliceBytesCompressed(docId, context); } else { From 318b826bbb39bc7fb141e2ddbc6187178da9ed8e Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 16:35:23 +0800 Subject: [PATCH 017/101] Deleted FixByteChunkMVForwardIndexReaderV2 --- .../FixedByteChunkMVForwardIndexReaderV2.java | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java deleted file mode 100644 index 762672928d72..000000000000 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.pinot.segment.local.segment.index.readers.forward; - -import java.nio.ByteBuffer; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec.DataType; - - -/** - Same as FixedByteChunkMVForwardIndexReader, but the number of elements for each row is inferred - */ -public final class FixedByteChunkMVForwardIndexReaderV2 extends FixedByteChunkMVForwardIndexReader { - - public FixedByteChunkMVForwardIndexReaderV2(PinotDataBuffer dataBuffer, DataType storedType) { - super(dataBuffer, storedType); - } - - @Override - public int getNumValuesMV(int docId, ChunkReaderContext context) { - ByteBuffer byteBuffer = slice(docId, context); - return getNumValuesMV(byteBuffer); - } - - @Override - protected int getNumValuesMV(ByteBuffer byteBuffer) { - return byteBuffer.remaining() / _storedType.size(); - } -} From a9170b74f57da2f3026b97173ee741eeeb5d0f42 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 16:44:48 +0800 Subject: [PATCH 018/101] Deleted FixByteChunkMVForwardIndexReaderV2Test --- ...tiValueFixedByteRawIndexCreatorV2Test.java | 123 ------------------ 1 file changed, 123 deletions(-) delete mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java deleted file mode 100644 index f7105630c5e3..000000000000 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ /dev/null @@ -1,123 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.pinot.segment.local.segment.index.creator; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import org.apache.commons.io.FileUtils; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreatorV2; -import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; -import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - - -/** - Similar to MultiValueFixedByteRawIndexCreatorTest, but utilizes the newer version of the forward index creator and - reader. 
Additionally, this test class includes a validation test for checking the compression ratio improvement with - the forward index creator version upgrade. - */ -public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { - @BeforeClass - public void setup() - throws Exception { - _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawV2Test"; - FileUtils.forceMkdir(new File(_outputDir)); - } - - @Override - public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator( - ChunkCompressionType compressionType, String column, int numDocs, FieldSpec.DataType dataType, int maxElements, - int writerVersion) - throws IOException { - return new MultiValueFixedByteRawIndexCreatorV2(new File(_outputDir), compressionType, column, numDocs, dataType, - maxElements, false, writerVersion, 1024 * 1024, 1000); - } - - @Override - public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, - int writerVersion) { - return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV5(buffer, - dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); - } - - @Test - public void validateCompressionRatioIncrease() - throws IOException { - // Generate input data containing short MV docs with somewhat repetitive data - int numDocs = 1000000; - int numElements = 0; - int maxMVRowSize = 0; - List inputData = new ArrayList<>(numDocs); - for (int i = 0; i < numDocs; i++) { - long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; - maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); - numElements += mvRow.length; - for (int j = 0; j < mvRow.length; j++, numElements++) { - mvRow[j] = numElements % 10; - } - inputData.add(mvRow); - } - - for (int writerVersion : List.of(2, 4)) { - // Generate MV fixed byte raw fwd index with explicit length - File explicitLengthFwdIndexFile = new File(_outputDir, MultiValueFixedByteRawIndexCreator.class.getSimpleName()); - FileUtils.deleteQuietly(explicitLengthFwdIndexFile); - try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator( - explicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, - true, writerVersion)) { - for (long[] mvRow : inputData) { - creator.putLongMV(mvRow); - } - } - - // Generate MV fixed byte raw fwd index with implicit length - File implicitLengthFwdIndexFile = - new File(_outputDir, MultiValueFixedByteRawIndexCreatorV2.class.getSimpleName()); - FileUtils.deleteQuietly(implicitLengthFwdIndexFile); - try (MultiValueFixedByteRawIndexCreatorV2 creator = new MultiValueFixedByteRawIndexCreatorV2( - implicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, - true, writerVersion)) { - for (long[] mvRow : inputData) { - creator.putLongMV(mvRow); - } - } - - // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: - // 
1. At least 1.15x larger than the implicit length variant when using Writer Version 2 - // 2. At least 2x larger than the implicit length variant when using Writer Version 4 - long expectedImplicitLengthFwdIndexMaxSize; - if (writerVersion == 2) { - expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); - } else { - expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); - } - Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); - } - } -} From d699c2ada03219c36d0cf73057aeb12becc3f0d4 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 17:22:15 +0800 Subject: [PATCH 019/101] Add VarByteChunkV5Test unit test --- .../index/creator/VarByteChunkV5Test.java | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java new file mode 100644 index 000000000000..bcc71b94dbad --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -0,0 +1,227 @@ +package org.apache.pinot.segment.local.segment.index.creator; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.BiConsumer; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import org.apache.commons.io.FileUtils; +import 
org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec; +import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; + + +public class VarByteChunkV5Test extends VarByteChunkV4Test { + private static final Random RANDOM = new Random(); + private static File[] _dirs; + + @DataProvider(parallel = true) + public Object[][] params() { + Object[][] params = new Object[][]{ + {null, ChunkCompressionType.LZ4, 20, 1024}, {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 20, 1024}, {null + , ChunkCompressionType.PASS_THROUGH, 20, 1024}, {null, ChunkCompressionType.SNAPPY, 20, 1024}, {null, + ChunkCompressionType.ZSTANDARD, 20, 1024}, {null, ChunkCompressionType.LZ4, 2048, 1024}, {null, + ChunkCompressionType.LZ4_LENGTH_PREFIXED, 2048, 1024}, {null, ChunkCompressionType.PASS_THROUGH, 2048, 1024}, + {null, ChunkCompressionType.SNAPPY, 2048, 1024}, {null, ChunkCompressionType.ZSTANDARD, 2048, 1024} + }; + + for (int i = 0; i < _dirs.length; i++) { + params[i][0] = _dirs[i]; + } + + return params; + } + + @BeforeClass + public void forceMkDirs() + throws IOException { + _dirs = new File[10]; + for (int i = 0; i < _dirs.length; i++) { + _dirs[i] = new File(new File(FileUtils.getTempDirectory(), UUID.randomUUID().toString()), "VarByteChunkV5Test"); + FileUtils.forceMkdir(_dirs[i]); + } + } + + @AfterClass + public void deleteDirs() { + for (File dir : _dirs) { + FileUtils.deleteQuietly(dir); + } + } + + 
@Test(dataProvider = "params") + public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File stringSVFile = new File(file, "testStringSV"); + testWriteRead(stringSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, x -> x, + VarByteChunkForwardIndexWriterV5::putString, (reader, context, docId) -> reader.getString(docId, context)); + FileUtils.deleteQuietly(stringSVFile); + } + + @Test(dataProvider = "params") + public void testBytesSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File bytesSVFile = new File(file, "testBytesSV"); + testWriteRead(bytesSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.BYTES, + x -> x.getBytes(StandardCharsets.UTF_8), VarByteChunkForwardIndexWriterV5::putBytes, + (reader, context, docId) -> reader.getBytes(docId, context)); + FileUtils.deleteQuietly(bytesSVFile); + } + + @Test(dataProvider = "params") + public void testStringMV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File stringMVFile = new File(file, "testStringMV"); + testWriteRead(stringMVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, + new StringSplitterMV(), VarByteChunkForwardIndexWriterV5::putStringMV, + (reader, context, docId) -> reader.getStringMV(docId, context)); + FileUtils.deleteQuietly(stringMVFile); + } + + @Test(dataProvider = "params") + public void testBytesMV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File bytesMVFile = new File(file, "testBytesMV"); + testWriteRead(bytesMVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.BYTES, new ByteSplitterMV(), + VarByteChunkForwardIndexWriterV5::putBytesMV, (reader, context, docId) -> reader.getBytesMV(docId, context)); + FileUtils.deleteQuietly(bytesMVFile); + } + + @Test + 
public void validateCompressionRatioIncrease() + throws IOException { + // Generate input data containing short MV docs with somewhat repetitive data + int numDocs = 1000000; + int numElements = 0; + int maxMVRowSize = 0; + List inputData = new ArrayList<>(numDocs); + for (int i = 0; i < numDocs; i++) { + long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; + maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); + numElements += mvRow.length; + for (int j = 0; j < mvRow.length; j++, numElements++) { + mvRow[j] = numElements % 10; + } + inputData.add(mvRow); + } + + int writerVersion = 5; + // Generate MV fixed byte raw fwd index with explicit length + File explicitLengthFwdIndexFile = _dirs[0]; + FileUtils.deleteQuietly(explicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(explicitLengthFwdIndexFile, + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 4)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // Generate MV fixed byte raw fwd index with implicit length + File implicitLengthFwdIndexFile = _dirs[1]; + FileUtils.deleteQuietly(implicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(implicitLengthFwdIndexFile, + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 5)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be at least + // 2x larger size in explicit length variant in V4 compared to the new implicit length variant in V5 + long expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); + Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); + } + + static class ByteSplitterMV implements Function { + 
@Override + public byte[][] apply(String input) { + List res = new ArrayList<>(); + for (int i = 0; i < input.length(); i += 3) { + int endIndex = Math.min(i + 3, input.length()); + res.add(input.substring(i, endIndex).getBytes()); + } + return res.toArray(new byte[0][]); + } + } + + private void testWriteRead(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize, + FieldSpec.DataType dataType, Function forwardMapper, + BiConsumer write, Read read) + throws IOException { + List values = randomStrings(1000, longestEntry).map(forwardMapper).collect(Collectors.toList()); + try (VarByteChunkForwardIndexWriterV5 writer = new VarByteChunkForwardIndexWriterV5(file, compressionType, + chunkSize)) { + for (T value : values) { + write.accept(writer, value); + } + } + try (PinotDataBuffer buffer = PinotDataBuffer.mapReadOnlyBigEndianFile(file)) { + try (VarByteChunkForwardIndexReaderV5 reader = new VarByteChunkForwardIndexReaderV5(buffer, dataType, true); + VarByteChunkForwardIndexReaderV5.ReaderContext context = reader.createContext()) { + for (int i = 0; i < values.size(); i++) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = 0; i < values.size(); i += 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = 1; i < values.size(); i += 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = 1; i < values.size(); i += 100) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 1; i >= 0; i--) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 1; i >= 0; i -= 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 2; i >= 0; i -= 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 1; i >= 0; i -= 100) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + } + 
} + } + + private Stream randomStrings(int count, int lengthOfLongestEntry) { + return IntStream.range(0, count).mapToObj(i -> { + int length = ThreadLocalRandom.current().nextInt(lengthOfLongestEntry); + byte[] bytes = new byte[length]; + if (length != 0) { + bytes[bytes.length - 1] = 'c'; + if (length > 2) { + Arrays.fill(bytes, 1, bytes.length - 1, (byte) 'b'); + } + bytes[0] = 'a'; + } + return new String(bytes, StandardCharsets.UTF_8); + }); + } + + @FunctionalInterface + interface Read { + T read(VarByteChunkForwardIndexReaderV5 reader, VarByteChunkForwardIndexReaderV5.ReaderContext context, int docId); + } +} From 713779275a7be6a6b9aaa0bbc53f8934c6940a0d Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 17:56:48 +0800 Subject: [PATCH 020/101] Add license to VarByteChunkV5Test unit test --- .../index/creator/VarByteChunkV5Test.java | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index bcc71b94dbad..59e336d14384 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; @@ -37,11 +55,16 @@ public class VarByteChunkV5Test extends VarByteChunkV4Test { @DataProvider(parallel = true) public Object[][] params() { Object[][] params = new Object[][]{ - {null, ChunkCompressionType.LZ4, 20, 1024}, {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 20, 1024}, {null - , ChunkCompressionType.PASS_THROUGH, 20, 1024}, {null, ChunkCompressionType.SNAPPY, 20, 1024}, {null, - ChunkCompressionType.ZSTANDARD, 20, 1024}, {null, ChunkCompressionType.LZ4, 2048, 1024}, {null, - ChunkCompressionType.LZ4_LENGTH_PREFIXED, 2048, 1024}, {null, ChunkCompressionType.PASS_THROUGH, 2048, 1024}, - {null, ChunkCompressionType.SNAPPY, 2048, 1024}, {null, ChunkCompressionType.ZSTANDARD, 2048, 1024} + {null, ChunkCompressionType.LZ4, 20, 1024}, + {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 20, 1024}, + {null, ChunkCompressionType.PASS_THROUGH, 20, 1024}, + {null, ChunkCompressionType.SNAPPY, 20, 1024}, + {null, ChunkCompressionType.ZSTANDARD, 20, 1024}, + {null, ChunkCompressionType.LZ4, 2048, 1024}, + {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 2048, 1024}, + {null, ChunkCompressionType.PASS_THROUGH, 2048, 1024}, + {null, ChunkCompressionType.SNAPPY, 2048, 1024}, + {null, ChunkCompressionType.ZSTANDARD, 2048, 1024} }; for (int i = 0; i < _dirs.length; i++) { @@ -124,12 +147,17 @@ public void validateCompressionRatioIncrease() inputData.add(mvRow); } - int writerVersion = 5; + for (int i = 0; i < _dirs.length; i++) { + _dirs[i] = new File(new 
File(FileUtils.getTempDirectory(), UUID.randomUUID().toString()), "VarByteChunkV5Test"); + FileUtils.forceMkdir(_dirs[i]); + } + // Generate MV fixed byte raw fwd index with explicit length - File explicitLengthFwdIndexFile = _dirs[0]; + int rawIndexVersionV4 = 4; + File explicitLengthFwdIndexFile = new File(FileUtils.getTempDirectory(), Integer.toString(rawIndexVersionV4)); FileUtils.deleteQuietly(explicitLengthFwdIndexFile); try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(explicitLengthFwdIndexFile, - ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 4)) { + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, rawIndexVersionV4)) { for (long[] mvRow : inputData) { creator.putLongMV(mvRow); } From 6452c79f09f83192d671052bb41bb14cba552d16 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 22:02:01 +0800 Subject: [PATCH 021/101] Improved unit test --- .../index/creator/VarByteChunkV4Test.java | 21 +++++++++- .../index/creator/VarByteChunkV5Test.java | 38 +++---------------- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java index 70313d91e701..3b83a793fcb0 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java @@ -33,6 +33,7 @@ import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import 
org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; @@ -88,9 +89,25 @@ public void deleteDirs() { } } + public VarByteChunkForwardIndexWriterV5 writerSupplierV5(File file, ChunkCompressionType compressionType, + int chunkSize) + throws IOException { + return new VarByteChunkForwardIndexWriterV5(file, compressionType, chunkSize); + } + + public VarByteChunkForwardIndexWriterV4 writerSupplierV4(File file, ChunkCompressionType compressionType, + int chunkSize) + throws IOException { + return new VarByteChunkForwardIndexWriterV4(file, compressionType, chunkSize); + } + + public VarByteChunkForwardIndexReaderV4 readerSupplierV4(PinotDataBuffer buffer, FieldSpec.DataType dataType, boolean isSingleValue) { + return new VarByteChunkForwardIndexReaderV4(buffer, dataType, isSingleValue); + } + @Test(dataProvider = "params") public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) - throws IOException { + throws IOException, RuntimeException { File stringSVFile = new File(file, "testStringSV"); testWriteRead(stringSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, x -> x, VarByteChunkForwardIndexWriterV4::putString, (reader, context, docId) -> reader.getString(docId, context)); @@ -193,7 +210,7 @@ private void testWriteRead(File file, ChunkCompressionType compressionType, } } - private Stream randomStrings(int count, int lengthOfLongestEntry) { + protected Stream randomStrings(int count, int lengthOfLongestEntry) { return IntStream.range(0, count) .mapToObj(i -> { int length = ThreadLocalRandom.current().nextInt(lengthOfLongestEntry); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index 59e336d14384..c5bbc75c2760 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -22,16 +22,12 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.Random; import java.util.UUID; -import java.util.concurrent.ThreadLocalRandom; import java.util.function.BiConsumer; import java.util.function.Function; import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; @@ -164,10 +160,11 @@ public void validateCompressionRatioIncrease() } // Generate MV fixed byte raw fwd index with implicit length - File implicitLengthFwdIndexFile = _dirs[1]; + int rawIndexVersionV5 = 5; + File implicitLengthFwdIndexFile = new File(FileUtils.getTempDirectory(), Integer.toString(rawIndexVersionV5)); FileUtils.deleteQuietly(implicitLengthFwdIndexFile); try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(implicitLengthFwdIndexFile, - ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 5)) { + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, rawIndexVersionV5)) { for (long[] mvRow : inputData) { creator.putLongMV(mvRow); } @@ -177,18 +174,10 @@ public void validateCompressionRatioIncrease() // 2x larger size in explicit length variant in V4 compared to the new implicit length variant in V5 long 
expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); - } - static class ByteSplitterMV implements Function { - @Override - public byte[][] apply(String input) { - List res = new ArrayList<>(); - for (int i = 0; i < input.length(); i += 3) { - int endIndex = Math.min(i + 3, input.length()); - res.add(input.substring(i, endIndex).getBytes()); - } - return res.toArray(new byte[0][]); - } + // Cleanup + FileUtils.deleteQuietly(explicitLengthFwdIndexFile); + FileUtils.deleteQuietly(implicitLengthFwdIndexFile); } private void testWriteRead(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize, @@ -233,21 +222,6 @@ private void testWriteRead(File file, ChunkCompressionType compressionType, } } - private Stream randomStrings(int count, int lengthOfLongestEntry) { - return IntStream.range(0, count).mapToObj(i -> { - int length = ThreadLocalRandom.current().nextInt(lengthOfLongestEntry); - byte[] bytes = new byte[length]; - if (length != 0) { - bytes[bytes.length - 1] = 'c'; - if (length > 2) { - Arrays.fill(bytes, 1, bytes.length - 1, (byte) 'b'); - } - bytes[0] = 'a'; - } - return new String(bytes, StandardCharsets.UTF_8); - }); - } - @FunctionalInterface interface Read { T read(VarByteChunkForwardIndexReaderV5 reader, VarByteChunkForwardIndexReaderV5.ReaderContext context, int docId); From 9bfbd2219e0974e866e3cc59757964cbce1f85ae Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 22:29:40 +0800 Subject: [PATCH 022/101] Refactored unit test --- .../index/creator/VarByteChunkV4Test.java | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java index 
3b83a793fcb0..6c8eb6d9fe89 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java @@ -33,7 +33,6 @@ import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; @@ -89,22 +88,6 @@ public void deleteDirs() { } } - public VarByteChunkForwardIndexWriterV5 writerSupplierV5(File file, ChunkCompressionType compressionType, - int chunkSize) - throws IOException { - return new VarByteChunkForwardIndexWriterV5(file, compressionType, chunkSize); - } - - public VarByteChunkForwardIndexWriterV4 writerSupplierV4(File file, ChunkCompressionType compressionType, - int chunkSize) - throws IOException { - return new VarByteChunkForwardIndexWriterV4(file, compressionType, chunkSize); - } - - public VarByteChunkForwardIndexReaderV4 readerSupplierV4(PinotDataBuffer buffer, FieldSpec.DataType dataType, boolean isSingleValue) { - return new VarByteChunkForwardIndexReaderV4(buffer, dataType, isSingleValue); - } - @Test(dataProvider = "params") public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) throws IOException, RuntimeException { From fac46c55362182768085f6cc87df0fa414cb95b3 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 23:35:40 +0800 Subject: [PATCH 023/101] Add blank line --- .../segment/local/segment/index/creator/VarByteChunkV5Test.java | 1 + 1 file changed, 1 insertion(+) diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index c5bbc75c2760..f80f764fe462 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. */ + package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; From 29a9fdb8b77021d0b58ea47372cbf0a80a619b54 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 23:35:59 +0800 Subject: [PATCH 024/101] Remove blank line --- .../segment/local/segment/index/creator/VarByteChunkV5Test.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index f80f764fe462..c5bbc75c2760 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; From 8abf7fefbf56d8029ef9643d4dbfa227a01e6044 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 10 Oct 2024 22:42:48 +0800 Subject: [PATCH 025/101] Add blank line --- .../pinot/integration/tests/UpsertTableIntegrationTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java index 30b15c0c1a6e..f7404dc64bd8 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java @@ -71,6 +71,7 @@ * - Dimension fields: playerId:int (primary key), name:string, game:string, deleted:boolean * - Metric fields: score:float * - DataTime fields: timestampInEpoch:long + * */ public class UpsertTableIntegrationTest extends BaseClusterIntegrationTestSet { private static final String INPUT_DATA_SMALL_TAR_FILE = "gameScores_csv.tar.gz"; From 1c877c6ba922f6c135958995b0d272595b30728a Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 10 Oct 2024 22:43:01 +0800 Subject: [PATCH 026/101] Remove blank line --- .../pinot/integration/tests/UpsertTableIntegrationTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java index f7404dc64bd8..30b15c0c1a6e 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java @@ -71,7 +71,6 @@ * - Dimension fields: playerId:int 
(primary key), name:string, game:string, deleted:boolean * - Metric fields: score:float * - DataTime fields: timestampInEpoch:long - * */ public class UpsertTableIntegrationTest extends BaseClusterIntegrationTestSet { private static final String INPUT_DATA_SMALL_TAR_FILE = "gameScores_csv.tar.gz"; From ce6870bfb9e2b9d59dcb1d0d6a17d84acdc5be63 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Sat, 28 Sep 2024 04:52:16 +0800 Subject: [PATCH 027/101] Rebase with lastest master --- ...ultiValueFixedByteRawIndexCreatorTest.java | 118 ++++++++++-------- 1 file changed, 65 insertions(+), 53 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 33c8525a42d2..affbfd0b10ed 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -54,6 +54,8 @@ public class MultiValueFixedByteRawIndexCreatorTest { private static final Random RANDOM = new Random(); + private static final boolean[] EXPLICIT_MV_ENTRY_SIZE_OPTIONS = {true, false}; + @DataProvider(name = "compressionTypes") public Object[][] compressionTypes() { return Arrays.stream(ChunkCompressionType.values()) @@ -78,77 +80,86 @@ public void cleanup() { @Test(dataProvider = "compressionTypes") public void testMVInt(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.INT, ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, 
length); - }, compressionType, writerVersion); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.INT, ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } @Test(dataProvider = "compressionTypes") public void testMVLong(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, 
MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } @Test(dataProvider = "compressionTypes") public void testMVFloat(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean 
explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } @Test(dataProvider = "compressionTypes") public void testMVDouble(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, + 
MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } public void testMV(DataType dataType, List inputs, ToIntFunction sizeof, IntFunction constructor, - Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion) + Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion, + boolean explicitMVEntrySize) throws IOException { String column = "testCol_" + dataType; int numDocs = inputs.size(); @@ -158,6 +169,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, maxElements, false, writerVersion, 1024 * 1024, 1000); + creator.setExplicitMVEntrySize(explicitMVEntrySize); inputs.forEach(input -> injector.inject(creator, input)); creator.close(); From 0d71f91bd6e911b68343af6dcfcd760b5eeafaba Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 05:24:41 +0800 Subject: [PATCH 028/101] Fixed uncovered code paths exposed via unit test --- .../segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java | 2 +- .../impl/fwd/MultiValueFixedByteRawIndexCreator.java | 1 + .../forward/FixedByteChunkMVForwardIndexReader.java | 5 +++++ .../creator/MultiValueFixedByteRawIndexCreatorTest.java | 7 ++++--- 4 
files changed, 11 insertions(+), 4 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java index c681265ffb9f..e15bb49d0ee5 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java @@ -120,7 +120,7 @@ public CLPForwardIndexCreatorV1(File baseIndexDir, String column, int numDocs, C _encodedVarsFwdIndexFile = new File(_intermediateFilesDir, column + "_clp_encodedvars.fwd"); _encodedVarsFwdIndexWriter = new MultiValueFixedByteRawIndexCreator(_encodedVarsFwdIndexFile, ChunkCompressionType.LZ4, numDocs, - FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, + FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, true, VarByteChunkForwardIndexWriterV4.VERSION); _clpStats.clear(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index e7877f8b6064..4fafcbcaad5e 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -69,6 +69,7 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType c ForwardIndexConfig.DEFAULT_TARGET_DOCS_PER_CHUNK); } + public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, DataType 
valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java index 8e53ecb15639..96299ffb32c0 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java @@ -42,6 +42,11 @@ public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType s _maxChunkSize = _numDocsPerChunk * (ROW_OFFSET_SIZE + _lengthOfLongestEntry); } + public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType storedType, boolean explicitMVEntrySize) { + this(dataBuffer, storedType); + _explicitMVEntrySize = explicitMVEntrySize; + } + @Nullable @Override public ChunkReaderContext createContext() { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index affbfd0b10ed..2a5567efa464 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.IOException; +import java.nio.BufferUnderflowException; import java.nio.ByteOrder; import java.util.ArrayList; import java.util.Arrays; @@ -168,8 
+169,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo file.delete(); MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, - maxElements, false, writerVersion, 1024 * 1024, 1000); - creator.setExplicitMVEntrySize(explicitMVEntrySize); + maxElements, false, explicitMVEntrySize, writerVersion, 1024 * 1024, 1000); inputs.forEach(input -> injector.inject(creator, input)); creator.close(); @@ -177,7 +177,8 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo final PinotDataBuffer buffer = PinotDataBuffer.mapFile(file, true, 0, file.length(), ByteOrder.BIG_ENDIAN, ""); ForwardIndexReader reader = writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + dataType.getStoredType(), false, explicitMVEntrySize) + : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType(), explicitMVEntrySize); final ForwardIndexReaderContext context = reader.createContext(); T valueBuffer = constructor.apply(maxElements); From 736f23fbf93345c25a90e1f79446095ac7143ff8 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 05:31:50 +0800 Subject: [PATCH 029/101] Fix style issue --- .../index/creator/MultiValueFixedByteRawIndexCreatorTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 2a5567efa464..b32281f9302c 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -20,7 +20,6 @@ import java.io.File; import java.io.IOException; -import java.nio.BufferUnderflowException; import java.nio.ByteOrder; import java.util.ArrayList; import java.util.Arrays; From f2faecefb165a38f2626e7b3d50f6f7beb4ca082 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:21:34 +0800 Subject: [PATCH 030/101] Refactored code to use new class versions. --- .../impl/fwd/CLPForwardIndexCreatorV1.java | 2 +- .../MultiValueFixedByteRawIndexCreatorV2.java | 116 ++++++++++++++++ .../FixedByteChunkMVForwardIndexReaderV2.java | 45 +++++++ .../VarByteChunkForwardIndexReaderV5.java | 47 +++++++ ...ultiValueFixedByteRawIndexCreatorTest.java | 124 ++++++++---------- ...tiValueFixedByteRawIndexCreatorV2Test.java | 52 ++++++++ 6 files changed, 318 insertions(+), 68 deletions(-) create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java create mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java index e15bb49d0ee5..c681265ffb9f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java @@ -120,7 +120,7 @@ public CLPForwardIndexCreatorV1(File baseIndexDir, String column, int numDocs, C _encodedVarsFwdIndexFile = new File(_intermediateFilesDir, column + "_clp_encodedvars.fwd"); _encodedVarsFwdIndexWriter = new MultiValueFixedByteRawIndexCreator(_encodedVarsFwdIndexFile, ChunkCompressionType.LZ4, numDocs, - FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, true, + FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, VarByteChunkForwardIndexWriterV4.VERSION); _clpStats.clear(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java new file mode 100644 index 000000000000..90bc9281d23a --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.creator.impl.fwd; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + Same as MultiValueFixedByteRawIndexCreator, but without storing the number of elements for each row. + */ +public class MultiValueFixedByteRawIndexCreatorV2 extends MultiValueFixedByteRawIndexCreator { + /** + * Create a var-byte raw index creator for the given column + * + * @param baseIndexDir Index directory + * @param compressionType Type of compression to use + * @param column Name of column to index + * @param totalDocs Total number of documents to index + * @param valueType Type of the values + * @param deriveNumDocsPerChunk true if writer should auto-derive the number of rows per chunk + * @param writerVersion writer format version + * @param targetMaxChunkSizeBytes target max chunk size in bytes, applicable only for V4 or when + * deriveNumDocsPerChunk is true + */ + public MultiValueFixedByteRawIndexCreatorV2(File baseIndexDir, ChunkCompressionType compressionType, String column, + int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, + int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) + throws IOException { + super(baseIndexDir, compressionType, column, totalDocs, valueType, maxNumberOfMultiValueElements, + deriveNumDocsPerChunk, writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); + } + + public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion) + throws IOException { + super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, + writerVersion); + } + + public 
MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, + int targetMaxChunkSizeBytes, int targetDocsPerChunk) + throws IOException { + super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, + writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); + } + + @Override + protected int computeTotalMaxLength(int maxNumberOfMultiValueElements, DataType valueType) { + return maxNumberOfMultiValueElements * valueType.getStoredType().size(); + } + + @Override + public void putIntMV(int[] values) { + byte[] bytes = new byte[values.length * Integer.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (int value : values) { + byteBuffer.putInt(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putLongMV(long[] values) { + byte[] bytes = new byte[values.length * Long.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (long value : values) { + byteBuffer.putLong(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putFloatMV(float[] values) { + byte[] bytes = new byte[values.length * Float.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (float value : values) { + byteBuffer.putFloat(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putDoubleMV(double[] values) { + byte[] bytes = new byte[values.length * Double.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (double value : values) { + byteBuffer.putDouble(value); + } + _indexWriter.putBytes(bytes); + } +} diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java new file mode 100644 index 000000000000..762672928d72 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.index.readers.forward; + +import java.nio.ByteBuffer; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + Same as FixedByteChunkMVForwardIndexReader, but the number of elements for each row is inferred + */ +public final class FixedByteChunkMVForwardIndexReaderV2 extends FixedByteChunkMVForwardIndexReader { + + public FixedByteChunkMVForwardIndexReaderV2(PinotDataBuffer dataBuffer, DataType storedType) { + super(dataBuffer, storedType); + } + + @Override + public int getNumValuesMV(int docId, ChunkReaderContext context) { + ByteBuffer byteBuffer = slice(docId, context); + return getNumValuesMV(byteBuffer); + } + + @Override + protected int getNumValuesMV(ByteBuffer byteBuffer) { + return byteBuffer.remaining() / _storedType.size(); + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java new file mode 100644 index 000000000000..0a8a4527bebc --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.segment.index.readers.forward; + +import java.nio.ByteBuffer; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec; + + +/** + * Chunk-based raw (non-dictionary-encoded) forward index reader for values of SV variable length data types + * (BIG_DECIMAL, STRING, BYTES), MV fixed length and MV variable length data types. + *

For data layout, please refer to the documentation for {@link VarByteChunkForwardIndexWriterV4} + */ +public class VarByteChunkForwardIndexReaderV5 extends VarByteChunkForwardIndexReaderV4 { + public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, + boolean isSingleValue) { + super(dataBuffer, storedType, isSingleValue); + } + + public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, + boolean isSingleValue, boolean explicitMVEntrySize) { + super(dataBuffer, storedType, isSingleValue, explicitMVEntrySize); + } + + @Override + protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { + return byteBuffer.remaining() / _storedType.size(); + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index b32281f9302c..c17c77a8a874 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -54,8 +54,6 @@ public class MultiValueFixedByteRawIndexCreatorTest { private static final Random RANDOM = new Random(); - private static final boolean[] EXPLICIT_MV_ENTRY_SIZE_OPTIONS = {true, false}; - @DataProvider(name = "compressionTypes") public Object[][] compressionTypes() { return Arrays.stream(ChunkCompressionType.values()) @@ -80,86 +78,78 @@ public void cleanup() { @Test(dataProvider = "compressionTypes") public void testMVInt(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.INT, 
ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.INT, ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } @Test(dataProvider = "compressionTypes") public void testMVLong(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, 
explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } @Test(dataProvider = "compressionTypes") public void testMVFloat(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, 
MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } @Test(dataProvider = "compressionTypes") public void testMVDouble(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, 
writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } public void testMV(DataType dataType, List inputs, ToIntFunction sizeof, IntFunction constructor, - Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion, - boolean explicitMVEntrySize) + Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion) throws IOException { String column = "testCol_" + dataType; int numDocs = inputs.size(); @@ -168,7 +158,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo file.delete(); MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, - maxElements, false, explicitMVEntrySize, writerVersion, 1024 * 1024, 1000); + maxElements, false, writerVersion, 1024 * 1024, 1000); inputs.forEach(input -> injector.inject(creator, input)); creator.close(); @@ -176,8 +166,8 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo final PinotDataBuffer buffer = PinotDataBuffer.mapFile(file, true, 0, file.length(), ByteOrder.BIG_ENDIAN, ""); ForwardIndexReader reader = writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false, explicitMVEntrySize) - : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType(), explicitMVEntrySize); + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); final ForwardIndexReaderContext context = reader.createContext(); T valueBuffer = constructor.apply(maxElements); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java new file mode 100644 index 000000000000..e4937a12ab3f --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.index.creator; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.function.IntFunction; +import java.util.function.ToIntFunction; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReader; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; +import org.apache.pinot.segment.spi.V1Constants.Indexes; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + + +public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { + // TODO: +} From 01cbf5653fecb99e22895dc1d51cdcdb92b98724 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:25:29 +0800 Subject: [PATCH 031/101] Fixed style. 
--- ...tiValueFixedByteRawIndexCreatorV2Test.java | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index e4937a12ab3f..8a3e2e4fc65e 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,35 +18,6 @@ */ package org.apache.pinot.segment.local.segment.index.creator; -import java.io.File; -import java.io.IOException; -import java.nio.ByteOrder; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; -import java.util.function.IntFunction; -import java.util.function.ToIntFunction; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.commons.io.FileUtils; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; -import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReader; -import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; -import org.apache.pinot.segment.spi.V1Constants.Indexes; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec.DataType; -import org.testng.Assert; -import 
org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - - public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { // TODO: } From f8c2f244d66c3056704e75350a97827bd50e6825 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:32:21 +0800 Subject: [PATCH 032/101] Refactored MultiValueFixedByteRawIndexCreatorTest.java --- ...MultiValueFixedByteRawIndexCreatorTest.java | 11 +++++++---- ...ltiValueFixedByteRawIndexCreatorV2Test.java | 18 +++++++++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index c17c77a8a874..78d72b49e452 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -164,10 +164,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo //read final PinotDataBuffer buffer = PinotDataBuffer.mapFile(file, true, 0, file.length(), ByteOrder.BIG_ENDIAN, ""); - ForwardIndexReader reader = - writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + ForwardIndexReader reader = getForwardIndexReader(buffer, dataType, writerVersion); final ForwardIndexReaderContext context = reader.createContext(); T valueBuffer = constructor.apply(maxElements); @@ -189,6 +186,12 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo } } + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + } + interface Extractor { T extract(ForwardIndexReader reader, ForwardIndexReaderContext context, int offset, T buffer); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index 8a3e2e4fc65e..1c90b8092f6a 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,6 +18,22 @@ */ package org.apache.pinot.segment.local.segment.index.creator; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; +import 
org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec; + + +/** + Same as MultiValueFixedByteRawIndexCreatorTest, but the forward index creator and reader are newer version + */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { - // TODO: + @Override + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV5(buffer, + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); + } } From a6ca3512090e00a854c2500c0e81eadf716a072b Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:36:46 +0800 Subject: [PATCH 033/101] Fix style. --- .../creator/MultiValueFixedByteRawIndexCreatorV2Test.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index 1c90b8092f6a..c07257901009 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -31,9 +31,9 @@ */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { @Override - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, + int writerVersion) { return writerVersion 
== VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV5(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); + dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); } } From 61fee18271571016449c9b42808ceaa308df36d1 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 14:25:36 +0800 Subject: [PATCH 034/101] Modified existing unit test and extended it for MultiValueFixedByteRawIndexCreatorV2Test.java --- ...ultiValueFixedByteRawIndexCreatorTest.java | 31 +++++++++++-------- ...tiValueFixedByteRawIndexCreatorV2Test.java | 27 ++++++++++++++-- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 78d72b49e452..8be27bf0fe09 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -49,8 +49,7 @@ public class MultiValueFixedByteRawIndexCreatorTest { - private static final String OUTPUT_DIR = - System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawTest"; + protected static String _outputDir; private static final Random RANDOM = new Random(); @@ -64,7 +63,8 @@ public Object[][] compressionTypes() { @BeforeClass public void setup() throws Exception { - FileUtils.forceMkdir(new File(OUTPUT_DIR)); + _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawTest"; + FileUtils.forceMkdir(new File(_outputDir)); } /** @@ -72,7 +72,7 @@ public void setup() */ @AfterClass public void 
cleanup() { - FileUtils.deleteQuietly(new File(OUTPUT_DIR)); + FileUtils.deleteQuietly(new File(_outputDir)); } @Test(dataProvider = "compressionTypes") @@ -148,17 +148,28 @@ public void testMVDouble(ChunkCompressionType compressionType, int writerVersion }, compressionType, writerVersion); } + public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator(ChunkCompressionType compressionType, + String column, int numDocs, DataType dataType, int maxElements, int writerVersion) + throws IOException { + return new MultiValueFixedByteRawIndexCreator(new File(_outputDir), compressionType, column, numDocs, dataType, + maxElements, false, writerVersion, 1024 * 1024, 1000); + } + + public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, + dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + } + public void testMV(DataType dataType, List inputs, ToIntFunction sizeof, IntFunction constructor, Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion) throws IOException { String column = "testCol_" + dataType; int numDocs = inputs.size(); int maxElements = inputs.stream().mapToInt(sizeof).max().orElseThrow(RuntimeException::new); - File file = new File(OUTPUT_DIR, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION); + File file = new File(_outputDir, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION); file.delete(); MultiValueFixedByteRawIndexCreator creator = - new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, - maxElements, false, writerVersion, 1024 * 1024, 1000); + getMultiValueFixedByteRawIndexCreator(compressionType, column, numDocs, dataType, maxElements, writerVersion); inputs.forEach(input -> injector.inject(creator, input)); 
creator.close(); @@ -186,12 +197,6 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo } } - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { - return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); - } - interface Extractor { T extract(ForwardIndexReader reader, ForwardIndexReaderContext context, int offset, T buffer); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index c07257901009..eae997014e22 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,20 +18,43 @@ */ package org.apache.pinot.segment.local.segment.index.creator; +import java.io.File; +import java.io.IOException; +import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreatorV2; import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; 
import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; +import org.testng.annotations.BeforeClass; /** - Same as MultiValueFixedByteRawIndexCreatorTest, but the forward index creator and reader are newer version + Same as MultiValueFixedByteRawIndexCreatorTest, but newer version of forward index creator and reader are used */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { + @BeforeClass + public void setup() + throws Exception { + _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawV2Test"; + FileUtils.forceMkdir(new File(_outputDir)); + } + + @Override + public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator( + ChunkCompressionType compressionType, String column, int numDocs, FieldSpec.DataType dataType, int maxElements, + int writerVersion) + throws IOException { + return new MultiValueFixedByteRawIndexCreatorV2(new File(_outputDir), compressionType, column, numDocs, dataType, + maxElements, false, writerVersion, 1024 * 1024, 1000); + } + @Override - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, + public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV5(buffer, dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); From 500dae6c1ff869b132d2cbb11ec2f0eebfc8ac16 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 18:15:33 +0800 Subject: [PATCH 035/101] Improved unit test for MultiValueFixedByteRawIndexCreatorTest and MultiValueFixedByteRawIndexCreatorV2Test --- ...ultiValueFixedByteRawIndexCreatorTest.java | 2 +- ...tiValueFixedByteRawIndexCreatorV2Test.java | 63 ++++++++++++++++++- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 8be27bf0fe09..1b4e4e9368a1 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -51,7 +51,7 @@ public class MultiValueFixedByteRawIndexCreatorTest { protected static String _outputDir; - private static final Random RANDOM = new Random(); + protected static final Random RANDOM = new Random(); @DataProvider(name = "compressionTypes") public Object[][] compressionTypes() { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index eae997014e22..dd3d0b1156b8 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -20,6 +20,8 @@ import java.io.File; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; @@ -30,11 +32,15 @@ import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; +import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; /** - Same as MultiValueFixedByteRawIndexCreatorTest, but newer version of forward index creator and reader are used + Similar to MultiValueFixedByteRawIndexCreatorTest, but utilizes the newer version of the forward index creator and + reader. Additionally, this test class includes a validation test for checking the compression ratio improvement with + the forward index creator version upgrade. */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { @BeforeClass @@ -59,4 +65,59 @@ public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpe return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV5(buffer, dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); } + + @Test + public void validateCompressionRatioIncrease() + throws IOException { + // Generate input data containing short MV docs with somewhat repetitive data + int numDocs = 1000000; + int numElements = 0; + int maxMVRowSize = 0; + List inputData = new ArrayList<>(numDocs); + for (int i = 0; i < numDocs; i++) { + long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; + maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); + numElements += mvRow.length; + for (int j = 0; j < mvRow.length; j++, numElements++) { + mvRow[j] = numElements % 10; + } + inputData.add(mvRow); + } + + for (int writerVersion : List.of(2, 4)) { + // Generate MV fixed byte raw fwd index with explicit length + File explicitLengthFwdIndexFile = new File(_outputDir, MultiValueFixedByteRawIndexCreator.class.getSimpleName()); + FileUtils.deleteQuietly(explicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator( + explicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, + true, writerVersion)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // Generate MV fixed byte raw fwd index with implicit length + File implicitLengthFwdIndexFile = + new File(_outputDir, MultiValueFixedByteRawIndexCreatorV2.class.getSimpleName()); + FileUtils.deleteQuietly(implicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreatorV2 creator = new MultiValueFixedByteRawIndexCreatorV2( + implicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, + true, writerVersion)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: + // 1. 
At least 15% larger than the implicit length variant when using Writer Version 2 + // 2. At least 200% larger than the implicit length variant when using Writer Version 4 + long expectedImplicitLengthFwdIndexMaxSize; + if (writerVersion == 2) { + expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); + } else { + expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); + } + Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); + } + } } From 427bdb5ef1f0ea48a0aa3b8db11923e66cb93aa5 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 23:17:42 +0800 Subject: [PATCH 036/101] Remove redundant blank line --- .../index/creator/MultiValueFixedByteRawIndexCreatorTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 1b4e4e9368a1..9a2105726aa7 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -139,7 +139,6 @@ public void testMVDouble(ChunkCompressionType compressionType, int writerVersion return Arrays.copyOf(buffer, length); }, compressionType, writerVersion); - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { From 0cb705d2e9e2395b158700a1a4ef4e0f181424d2 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 23:45:49 
+0800 Subject: [PATCH 037/101] Adjusted comments content --- .../creator/MultiValueFixedByteRawIndexCreatorV2Test.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index dd3d0b1156b8..f7105630c5e3 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -109,8 +109,8 @@ public void validateCompressionRatioIncrease() } // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: - // 1. At least 15% larger than the implicit length variant when using Writer Version 2 - // 2. At least 200% larger than the implicit length variant when using Writer Version 4 + // 1. At least 1.15x larger than the implicit length variant when using Writer Version 2 + // 2. At least 2x larger than the implicit length variant when using Writer Version 4 long expectedImplicitLengthFwdIndexMaxSize; if (writerVersion == 2) { expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); From 1617ebd566dafde8d1331cdfd66d163964de693d Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 3 Oct 2024 16:22:53 +0800 Subject: [PATCH 038/101] Removed redundant constructor missed during refactoring. 
--- .../readers/forward/VarByteChunkForwardIndexReaderV5.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java index 0a8a4527bebc..20569bf7ff4e 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -35,11 +35,6 @@ public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.Da super(dataBuffer, storedType, isSingleValue); } - public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, - boolean isSingleValue, boolean explicitMVEntrySize) { - super(dataBuffer, storedType, isSingleValue, explicitMVEntrySize); - } - @Override protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { return byteBuffer.remaining() / _storedType.size(); From 813d3601657470a6caec126cbed2cda61953dd4e Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 05:51:53 +0800 Subject: [PATCH 039/101] Upgrade MVFixedByteRawIndex reader and writer from V4 to V5, retain forward index creator version. 
--- .../VarByteChunkForwardIndexWriterV4.java | 12 +- .../VarByteChunkForwardIndexWriterV5.java | 55 +++++++++ .../MultiValueFixedByteRawIndexCreatorV2.java | 116 ------------------ .../forward/ForwardIndexReaderFactory.java | 7 ++ .../VarByteChunkForwardIndexReaderV4.java | 8 +- .../VarByteChunkForwardIndexReaderV5.java | 8 ++ 6 files changed, 85 insertions(+), 121 deletions(-) create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java delete mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index e7bc30fc7027..b9f1f876396c 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -76,11 +76,13 @@ public class VarByteChunkForwardIndexWriterV4 implements VarByteChunkWriter { public static final int VERSION = 4; - private static final Logger LOGGER = LoggerFactory.getLogger(VarByteChunkForwardIndexWriterV4.class); + // Use the run-time concrete class to retrieve the logger + protected final Logger LOGGER = LoggerFactory.getLogger(this.getClass()); + private static final String DATA_BUFFER_SUFFIX = ".buf"; private final File _dataBuffer; - private final RandomAccessFile _output; + protected final RandomAccessFile _output; private final FileChannel _dataChannel; private final ByteBuffer _chunkBuffer; private final ByteBuffer _compressionBuffer; @@ -105,11 +107,15 @@ public VarByteChunkForwardIndexWriterV4(File file, ChunkCompressionType compress 
writeHeader(_chunkCompressor.compressionType(), chunkSize); } + public int getVersion() { + return VERSION; + } + private void writeHeader(ChunkCompressionType compressionType, int targetDecompressedChunkSize) throws IOException { // keep metadata BE for backwards compatibility // (e.g. the version needs to be read by a factory which assumes BE) - _output.writeInt(VERSION); + _output.writeInt(getVersion()); _output.writeInt(targetDecompressedChunkSize); _output.writeInt(compressionType.getValue()); // reserve a slot to write the data offset into diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java new file mode 100644 index 000000000000..72c94e210139 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.io.writer.impl; + +import java.io.File; +import java.io.IOException; +import javax.annotation.concurrent.NotThreadSafe; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; + + +/** + * Forward index writer that extends {@link VarByteChunkForwardIndexWriterV4} with the only difference being the + * version tag is now bumped from 4 to 5. + * + *

The {@code VERSION} tag is a {@code static final} class variable set to {@code 5}. Since static variables + * are shadowed in the child class thus associated with the class that defines them, care must be taken to ensure + * that the parent class can correctly observe the child class's {@code VERSION} value at runtime.

+ * + *

To achieve this, the {@code getVersion()} method is overridden to return the concrete subclass's + * {@code VERSION} value, ensuring that the correct version number is returned even when using a reference + * to the parent class.

+ * + * @see VarByteChunkForwardIndexWriterV4 + * @see VarByteChunkForwardIndexWriterV5#getVersion() + */ +@NotThreadSafe +public class VarByteChunkForwardIndexWriterV5 extends VarByteChunkForwardIndexWriterV4 { + public static final int VERSION = 5; + + public VarByteChunkForwardIndexWriterV5(File file, ChunkCompressionType compressionType, int chunkSize) + throws IOException { + super(file, compressionType, chunkSize); + } + + @Override + public int getVersion() { + return VERSION; + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java deleted file mode 100644 index 90bc9281d23a..000000000000 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.pinot.segment.local.segment.creator.impl.fwd; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.spi.data.FieldSpec.DataType; - - -/** - Same as MultiValueFixedByteRawIndexCreator, but without storing the number of elements for each row. - */ -public class MultiValueFixedByteRawIndexCreatorV2 extends MultiValueFixedByteRawIndexCreator { - /** - * Create a var-byte raw index creator for the given column - * - * @param baseIndexDir Index directory - * @param compressionType Type of compression to use - * @param column Name of column to index - * @param totalDocs Total number of documents to index - * @param valueType Type of the values - * @param deriveNumDocsPerChunk true if writer should auto-derive the number of rows per chunk - * @param writerVersion writer format version - * @param targetMaxChunkSizeBytes target max chunk size in bytes, applicable only for V4 or when - * deriveNumDocsPerChunk is true - */ - public MultiValueFixedByteRawIndexCreatorV2(File baseIndexDir, ChunkCompressionType compressionType, String column, - int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, - int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) - throws IOException { - super(baseIndexDir, compressionType, column, totalDocs, valueType, maxNumberOfMultiValueElements, - deriveNumDocsPerChunk, writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); - } - - public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion) - throws IOException { - super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, - writerVersion); - } - - public 
MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, - int targetMaxChunkSizeBytes, int targetDocsPerChunk) - throws IOException { - super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, - writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); - } - - @Override - protected int computeTotalMaxLength(int maxNumberOfMultiValueElements, DataType valueType) { - return maxNumberOfMultiValueElements * valueType.getStoredType().size(); - } - - @Override - public void putIntMV(int[] values) { - byte[] bytes = new byte[values.length * Integer.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (int value : values) { - byteBuffer.putInt(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putLongMV(long[] values) { - byte[] bytes = new byte[values.length * Long.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (long value : values) { - byteBuffer.putLong(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putFloatMV(float[] values) { - byte[] bytes = new byte[values.length * Float.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (float value : values) { - byteBuffer.putFloat(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putDoubleMV(double[] values) { - byte[] bytes = new byte[values.length * Double.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (double value : values) { - byteBuffer.putDouble(value); - } - _indexWriter.putBytes(bytes); - } -} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java index db815761d9ea..a4344b61f0de 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java @@ -19,8 +19,10 @@ package org.apache.pinot.segment.local.segment.index.forward; +import com.google.errorprone.annotations.Var; import java.util.Arrays; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.creator.impl.fwd.CLPForwardIndexCreatorV1; import org.apache.pinot.segment.local.segment.index.readers.forward.CLPForwardIndexReaderV1; import org.apache.pinot.segment.local.segment.index.readers.forward.FixedBitMVEntryDictForwardIndexReader; @@ -30,6 +32,7 @@ import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkSVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.forward.FixedBytePower2ChunkSVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkMVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkSVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.sorted.SortedIndexReaderImpl; @@ -106,6 +109,10 @@ public static ForwardIndexReader createRawIndexReader(PinotDataBuffer dataBuffer : new FixedByteChunkSVForwardIndexReader(dataBuffer, storedType); } + if (version >= VarByteChunkForwardIndexWriterV5.VERSION) { + // V5 is 
the same as V4 except the multi-value docs have implicit value count rather than explicit + return new VarByteChunkForwardIndexReaderV5(dataBuffer, storedType, isSingleValue); + } if (version == VarByteChunkForwardIndexWriterV4.VERSION) { // V4 reader is common for sv var byte, mv fixed byte and mv var byte return new VarByteChunkForwardIndexReaderV4(dataBuffer, storedType, isSingleValue); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java index 277805d22c95..b8b007ec8dd1 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java @@ -67,8 +67,7 @@ public class VarByteChunkForwardIndexReaderV4 public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, boolean isSingleValue) { - int version = dataBuffer.getInt(0); - Preconditions.checkState(version == VarByteChunkForwardIndexWriterV4.VERSION, "Illegal index version: %s", version); + validateIndexVersion(dataBuffer); _storedType = storedType; _targetDecompressedChunkSize = dataBuffer.getInt(4); _chunkCompressionType = ChunkCompressionType.valueOf(dataBuffer.getInt(8)); @@ -81,6 +80,11 @@ public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.Da _isSingleValue = isSingleValue; } + public void validateIndexVersion(PinotDataBuffer dataBuffer) { + int version = dataBuffer.getInt(0); + Preconditions.checkState(version == VarByteChunkForwardIndexWriterV4.VERSION, "Illegal index version: %s", version); + } + @Override public boolean isDictionaryEncoded() { return false; diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java index 20569bf7ff4e..fd4528c27df7 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -18,8 +18,10 @@ */ package org.apache.pinot.segment.local.segment.index.readers.forward; +import com.google.common.base.Preconditions; import java.nio.ByteBuffer; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; @@ -35,6 +37,12 @@ public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.Da super(dataBuffer, storedType, isSingleValue); } + @Override + public void validateIndexVersion(PinotDataBuffer dataBuffer) { + int version = dataBuffer.getInt(0); + Preconditions.checkState(version == VarByteChunkForwardIndexWriterV5.VERSION, "Illegal index version: %s", version); + } + @Override protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { return byteBuffer.remaining() / _storedType.size(); From 7a565e4401d37ff14c44644e7f45d6f6c74e9244 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 06:08:41 +0800 Subject: [PATCH 040/101] Fix minor style issue. 
--- .../io/writer/impl/VarByteChunkForwardIndexWriterV4.java | 4 ++-- .../segment/index/forward/ForwardIndexReaderFactory.java | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index b9f1f876396c..f94fd64cee75 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -77,7 +77,7 @@ public class VarByteChunkForwardIndexWriterV4 implements VarByteChunkWriter { public static final int VERSION = 4; // Use the run-time concrete class to retrieve the logger - protected final Logger LOGGER = LoggerFactory.getLogger(this.getClass()); + protected final Logger _logger = LoggerFactory.getLogger(this.getClass()); private static final String DATA_BUFFER_SUFFIX = ".buf"; @@ -276,7 +276,7 @@ private void write(ByteBuffer buffer, boolean huge) { _chunkOffset += compressedSize; _docIdOffset = _nextDocId; } catch (IOException e) { - LOGGER.error("Exception caught while compressing/writing data chunk", e); + _logger.error("Exception caught while compressing/writing data chunk", e); throw new RuntimeException(e); } finally { if (mapped != null) { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java index a4344b61f0de..fdf1cfd1b96a 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java @@ -19,7 +19,6 @@ package org.apache.pinot.segment.local.segment.index.forward; -import com.google.errorprone.annotations.Var; import java.util.Arrays; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; From 296959d33157a9064c9f3ec98e83076bad8a0112 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 16:35:23 +0800 Subject: [PATCH 041/101] Deleted FixByteChunkMVForwardIndexReaderV2 --- .../FixedByteChunkMVForwardIndexReaderV2.java | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java deleted file mode 100644 index 762672928d72..000000000000 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.pinot.segment.local.segment.index.readers.forward; - -import java.nio.ByteBuffer; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec.DataType; - - -/** - Same as FixedByteChunkMVForwardIndexReader, but the number of elements for each row is inferred - */ -public final class FixedByteChunkMVForwardIndexReaderV2 extends FixedByteChunkMVForwardIndexReader { - - public FixedByteChunkMVForwardIndexReaderV2(PinotDataBuffer dataBuffer, DataType storedType) { - super(dataBuffer, storedType); - } - - @Override - public int getNumValuesMV(int docId, ChunkReaderContext context) { - ByteBuffer byteBuffer = slice(docId, context); - return getNumValuesMV(byteBuffer); - } - - @Override - protected int getNumValuesMV(ByteBuffer byteBuffer) { - return byteBuffer.remaining() / _storedType.size(); - } -} From 0d7073b86810ca56140320aa33caffbc23f95f12 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 16:44:48 +0800 Subject: [PATCH 042/101] Deleted FixByteChunkMVForwardIndexReaderV2Test --- ...tiValueFixedByteRawIndexCreatorV2Test.java | 123 ------------------ 1 file changed, 123 deletions(-) delete mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java deleted file mode 100644 index f7105630c5e3..000000000000 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ /dev/null @@ -1,123 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.pinot.segment.local.segment.index.creator; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import org.apache.commons.io.FileUtils; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreatorV2; -import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; -import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - - -/** - Similar to MultiValueFixedByteRawIndexCreatorTest, but utilizes the newer version of the forward index creator and - reader. Additionally, this test class includes a validation test for checking the compression ratio improvement with - the forward index creator version upgrade. 
- */ -public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { - @BeforeClass - public void setup() - throws Exception { - _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawV2Test"; - FileUtils.forceMkdir(new File(_outputDir)); - } - - @Override - public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator( - ChunkCompressionType compressionType, String column, int numDocs, FieldSpec.DataType dataType, int maxElements, - int writerVersion) - throws IOException { - return new MultiValueFixedByteRawIndexCreatorV2(new File(_outputDir), compressionType, column, numDocs, dataType, - maxElements, false, writerVersion, 1024 * 1024, 1000); - } - - @Override - public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, - int writerVersion) { - return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV5(buffer, - dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); - } - - @Test - public void validateCompressionRatioIncrease() - throws IOException { - // Generate input data containing short MV docs with somewhat repetitive data - int numDocs = 1000000; - int numElements = 0; - int maxMVRowSize = 0; - List inputData = new ArrayList<>(numDocs); - for (int i = 0; i < numDocs; i++) { - long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; - maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); - numElements += mvRow.length; - for (int j = 0; j < mvRow.length; j++, numElements++) { - mvRow[j] = numElements % 10; - } - inputData.add(mvRow); - } - - for (int writerVersion : List.of(2, 4)) { - // Generate MV fixed byte raw fwd index with explicit length - File explicitLengthFwdIndexFile = new File(_outputDir, MultiValueFixedByteRawIndexCreator.class.getSimpleName()); - FileUtils.deleteQuietly(explicitLengthFwdIndexFile); - try 
(MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator( - explicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, - true, writerVersion)) { - for (long[] mvRow : inputData) { - creator.putLongMV(mvRow); - } - } - - // Generate MV fixed byte raw fwd index with implicit length - File implicitLengthFwdIndexFile = - new File(_outputDir, MultiValueFixedByteRawIndexCreatorV2.class.getSimpleName()); - FileUtils.deleteQuietly(implicitLengthFwdIndexFile); - try (MultiValueFixedByteRawIndexCreatorV2 creator = new MultiValueFixedByteRawIndexCreatorV2( - implicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, - true, writerVersion)) { - for (long[] mvRow : inputData) { - creator.putLongMV(mvRow); - } - } - - // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: - // 1. At least 1.15x larger than the implicit length variant when using Writer Version 2 - // 2. 
At least 2x larger than the implicit length variant when using Writer Version 4 - long expectedImplicitLengthFwdIndexMaxSize; - if (writerVersion == 2) { - expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); - } else { - expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); - } - Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); - } - } -} From 427af58e60e711a3a3b126cdf15da6d11fc4f772 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 17:22:15 +0800 Subject: [PATCH 043/101] Add VarByteChunkV5Test unit test --- .../index/creator/VarByteChunkV5Test.java | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java new file mode 100644 index 000000000000..bcc71b94dbad --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -0,0 +1,227 @@ +package org.apache.pinot.segment.local.segment.index.creator; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.BiConsumer; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; +import 
org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec; +import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; + + +public class VarByteChunkV5Test extends VarByteChunkV4Test { + private static final Random RANDOM = new Random(); + private static File[] _dirs; + + @DataProvider(parallel = true) + public Object[][] params() { + Object[][] params = new Object[][]{ + {null, ChunkCompressionType.LZ4, 20, 1024}, {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 20, 1024}, {null + , ChunkCompressionType.PASS_THROUGH, 20, 1024}, {null, ChunkCompressionType.SNAPPY, 20, 1024}, {null, + ChunkCompressionType.ZSTANDARD, 20, 1024}, {null, ChunkCompressionType.LZ4, 2048, 1024}, {null, + ChunkCompressionType.LZ4_LENGTH_PREFIXED, 2048, 1024}, {null, ChunkCompressionType.PASS_THROUGH, 2048, 1024}, + {null, ChunkCompressionType.SNAPPY, 2048, 1024}, {null, ChunkCompressionType.ZSTANDARD, 2048, 1024} + }; + + for (int i = 0; i < _dirs.length; i++) { + params[i][0] = _dirs[i]; + } + + return params; + } + + @BeforeClass + public void forceMkDirs() + throws IOException { + _dirs = new File[10]; + for (int i = 0; i < _dirs.length; i++) { + _dirs[i] = new File(new File(FileUtils.getTempDirectory(), UUID.randomUUID().toString()), "VarByteChunkV5Test"); + FileUtils.forceMkdir(_dirs[i]); + } + } + + @AfterClass + public void deleteDirs() { + for (File dir : _dirs) { + FileUtils.deleteQuietly(dir); + } + } + + @Test(dataProvider = "params") + public void testStringSV(File file, 
ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File stringSVFile = new File(file, "testStringSV"); + testWriteRead(stringSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, x -> x, + VarByteChunkForwardIndexWriterV5::putString, (reader, context, docId) -> reader.getString(docId, context)); + FileUtils.deleteQuietly(stringSVFile); + } + + @Test(dataProvider = "params") + public void testBytesSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File bytesSVFile = new File(file, "testBytesSV"); + testWriteRead(bytesSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.BYTES, + x -> x.getBytes(StandardCharsets.UTF_8), VarByteChunkForwardIndexWriterV5::putBytes, + (reader, context, docId) -> reader.getBytes(docId, context)); + FileUtils.deleteQuietly(bytesSVFile); + } + + @Test(dataProvider = "params") + public void testStringMV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File stringMVFile = new File(file, "testStringMV"); + testWriteRead(stringMVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, + new StringSplitterMV(), VarByteChunkForwardIndexWriterV5::putStringMV, + (reader, context, docId) -> reader.getStringMV(docId, context)); + FileUtils.deleteQuietly(stringMVFile); + } + + @Test(dataProvider = "params") + public void testBytesMV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File bytesMVFile = new File(file, "testBytesMV"); + testWriteRead(bytesMVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.BYTES, new ByteSplitterMV(), + VarByteChunkForwardIndexWriterV5::putBytesMV, (reader, context, docId) -> reader.getBytesMV(docId, context)); + FileUtils.deleteQuietly(bytesMVFile); + } + + @Test + public void validateCompressionRatioIncrease() + throws IOException { 
+ // Generate input data containing short MV docs with somewhat repetitive data + int numDocs = 1000000; + int numElements = 0; + int maxMVRowSize = 0; + List inputData = new ArrayList<>(numDocs); + for (int i = 0; i < numDocs; i++) { + long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; + maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); + numElements += mvRow.length; + for (int j = 0; j < mvRow.length; j++, numElements++) { + mvRow[j] = numElements % 10; + } + inputData.add(mvRow); + } + + int writerVersion = 5; + // Generate MV fixed byte raw fwd index with explicit length + File explicitLengthFwdIndexFile = _dirs[0]; + FileUtils.deleteQuietly(explicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(explicitLengthFwdIndexFile, + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 4)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // Generate MV fixed byte raw fwd index with implicit length + File implicitLengthFwdIndexFile = _dirs[1]; + FileUtils.deleteQuietly(implicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(implicitLengthFwdIndexFile, + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 5)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be at least + // 2x larger size in explicit length variant in V4 compared to the new implicit length variant in V5 + long expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); + Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); + } + + static class ByteSplitterMV implements Function { + @Override + public byte[][] apply(String input) { + List res = new 
ArrayList<>(); + for (int i = 0; i < input.length(); i += 3) { + int endIndex = Math.min(i + 3, input.length()); + res.add(input.substring(i, endIndex).getBytes()); + } + return res.toArray(new byte[0][]); + } + } + + private void testWriteRead(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize, + FieldSpec.DataType dataType, Function forwardMapper, + BiConsumer write, Read read) + throws IOException { + List values = randomStrings(1000, longestEntry).map(forwardMapper).collect(Collectors.toList()); + try (VarByteChunkForwardIndexWriterV5 writer = new VarByteChunkForwardIndexWriterV5(file, compressionType, + chunkSize)) { + for (T value : values) { + write.accept(writer, value); + } + } + try (PinotDataBuffer buffer = PinotDataBuffer.mapReadOnlyBigEndianFile(file)) { + try (VarByteChunkForwardIndexReaderV5 reader = new VarByteChunkForwardIndexReaderV5(buffer, dataType, true); + VarByteChunkForwardIndexReaderV5.ReaderContext context = reader.createContext()) { + for (int i = 0; i < values.size(); i++) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = 0; i < values.size(); i += 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = 1; i < values.size(); i += 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = 1; i < values.size(); i += 100) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 1; i >= 0; i--) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 1; i >= 0; i -= 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 2; i >= 0; i -= 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 1; i >= 0; i -= 100) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + } + } + } + + private Stream randomStrings(int count, int 
lengthOfLongestEntry) { + return IntStream.range(0, count).mapToObj(i -> { + int length = ThreadLocalRandom.current().nextInt(lengthOfLongestEntry); + byte[] bytes = new byte[length]; + if (length != 0) { + bytes[bytes.length - 1] = 'c'; + if (length > 2) { + Arrays.fill(bytes, 1, bytes.length - 1, (byte) 'b'); + } + bytes[0] = 'a'; + } + return new String(bytes, StandardCharsets.UTF_8); + }); + } + + @FunctionalInterface + interface Read { + T read(VarByteChunkForwardIndexReaderV5 reader, VarByteChunkForwardIndexReaderV5.ReaderContext context, int docId); + } +} From 6e8c3ae9a0eb07b2445a7fc67d776ea2431115ee Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 17:56:48 +0800 Subject: [PATCH 044/101] Add license to VarByteChunkV5Test unit test --- .../index/creator/VarByteChunkV5Test.java | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index bcc71b94dbad..59e336d14384 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; @@ -37,11 +55,16 @@ public class VarByteChunkV5Test extends VarByteChunkV4Test { @DataProvider(parallel = true) public Object[][] params() { Object[][] params = new Object[][]{ - {null, ChunkCompressionType.LZ4, 20, 1024}, {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 20, 1024}, {null - , ChunkCompressionType.PASS_THROUGH, 20, 1024}, {null, ChunkCompressionType.SNAPPY, 20, 1024}, {null, - ChunkCompressionType.ZSTANDARD, 20, 1024}, {null, ChunkCompressionType.LZ4, 2048, 1024}, {null, - ChunkCompressionType.LZ4_LENGTH_PREFIXED, 2048, 1024}, {null, ChunkCompressionType.PASS_THROUGH, 2048, 1024}, - {null, ChunkCompressionType.SNAPPY, 2048, 1024}, {null, ChunkCompressionType.ZSTANDARD, 2048, 1024} + {null, ChunkCompressionType.LZ4, 20, 1024}, + {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 20, 1024}, + {null, ChunkCompressionType.PASS_THROUGH, 20, 1024}, + {null, ChunkCompressionType.SNAPPY, 20, 1024}, + {null, ChunkCompressionType.ZSTANDARD, 20, 1024}, + {null, ChunkCompressionType.LZ4, 2048, 1024}, + {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 2048, 1024}, + {null, ChunkCompressionType.PASS_THROUGH, 2048, 1024}, + {null, ChunkCompressionType.SNAPPY, 2048, 1024}, + {null, ChunkCompressionType.ZSTANDARD, 2048, 1024} }; for (int i = 0; i < _dirs.length; i++) { @@ -124,12 +147,17 @@ public void validateCompressionRatioIncrease() inputData.add(mvRow); } - int writerVersion = 5; + for (int i = 0; i < _dirs.length; i++) { + _dirs[i] = new File(new 
File(FileUtils.getTempDirectory(), UUID.randomUUID().toString()), "VarByteChunkV5Test"); + FileUtils.forceMkdir(_dirs[i]); + } + // Generate MV fixed byte raw fwd index with explicit length - File explicitLengthFwdIndexFile = _dirs[0]; + int rawIndexVersionV4 = 4; + File explicitLengthFwdIndexFile = new File(FileUtils.getTempDirectory(), Integer.toString(rawIndexVersionV4)); FileUtils.deleteQuietly(explicitLengthFwdIndexFile); try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(explicitLengthFwdIndexFile, - ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 4)) { + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, rawIndexVersionV4)) { for (long[] mvRow : inputData) { creator.putLongMV(mvRow); } From 52976a770576469d6bafda3004953f73de7f0bcd Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 22:02:01 +0800 Subject: [PATCH 045/101] Improved unit test --- .../index/creator/VarByteChunkV4Test.java | 21 +++++++++- .../index/creator/VarByteChunkV5Test.java | 38 +++---------------- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java index 70313d91e701..3b83a793fcb0 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java @@ -33,6 +33,7 @@ import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import 
org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; @@ -88,9 +89,25 @@ public void deleteDirs() { } } + public VarByteChunkForwardIndexWriterV5 writerSupplierV5(File file, ChunkCompressionType compressionType, + int chunkSize) + throws IOException { + return new VarByteChunkForwardIndexWriterV5(file, compressionType, chunkSize); + } + + public VarByteChunkForwardIndexWriterV4 writerSupplierV4(File file, ChunkCompressionType compressionType, + int chunkSize) + throws IOException { + return new VarByteChunkForwardIndexWriterV4(file, compressionType, chunkSize); + } + + public VarByteChunkForwardIndexReaderV4 readerSupplierV4(PinotDataBuffer buffer, FieldSpec.DataType dataType, boolean isSingleValue) { + return new VarByteChunkForwardIndexReaderV4(buffer, dataType, isSingleValue); + } + @Test(dataProvider = "params") public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) - throws IOException { + throws IOException, RuntimeException { File stringSVFile = new File(file, "testStringSV"); testWriteRead(stringSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, x -> x, VarByteChunkForwardIndexWriterV4::putString, (reader, context, docId) -> reader.getString(docId, context)); @@ -193,7 +210,7 @@ private void testWriteRead(File file, ChunkCompressionType compressionType, } } - private Stream randomStrings(int count, int lengthOfLongestEntry) { + protected Stream randomStrings(int count, int lengthOfLongestEntry) { return IntStream.range(0, count) .mapToObj(i -> { int length = ThreadLocalRandom.current().nextInt(lengthOfLongestEntry); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index 59e336d14384..c5bbc75c2760 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -22,16 +22,12 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.Random; import java.util.UUID; -import java.util.concurrent.ThreadLocalRandom; import java.util.function.BiConsumer; import java.util.function.Function; import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; @@ -164,10 +160,11 @@ public void validateCompressionRatioIncrease() } // Generate MV fixed byte raw fwd index with implicit length - File implicitLengthFwdIndexFile = _dirs[1]; + int rawIndexVersionV5 = 5; + File implicitLengthFwdIndexFile = new File(FileUtils.getTempDirectory(), Integer.toString(rawIndexVersionV5)); FileUtils.deleteQuietly(implicitLengthFwdIndexFile); try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(implicitLengthFwdIndexFile, - ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 5)) { + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, rawIndexVersionV5)) { for (long[] mvRow : inputData) { creator.putLongMV(mvRow); } @@ -177,18 +174,10 @@ public void validateCompressionRatioIncrease() // 2x larger size in explicit length variant in V4 compared to the new implicit length variant in V5 long 
expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); - } - static class ByteSplitterMV implements Function { - @Override - public byte[][] apply(String input) { - List res = new ArrayList<>(); - for (int i = 0; i < input.length(); i += 3) { - int endIndex = Math.min(i + 3, input.length()); - res.add(input.substring(i, endIndex).getBytes()); - } - return res.toArray(new byte[0][]); - } + // Cleanup + FileUtils.deleteQuietly(explicitLengthFwdIndexFile); + FileUtils.deleteQuietly(implicitLengthFwdIndexFile); } private void testWriteRead(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize, @@ -233,21 +222,6 @@ private void testWriteRead(File file, ChunkCompressionType compressionType, } } - private Stream randomStrings(int count, int lengthOfLongestEntry) { - return IntStream.range(0, count).mapToObj(i -> { - int length = ThreadLocalRandom.current().nextInt(lengthOfLongestEntry); - byte[] bytes = new byte[length]; - if (length != 0) { - bytes[bytes.length - 1] = 'c'; - if (length > 2) { - Arrays.fill(bytes, 1, bytes.length - 1, (byte) 'b'); - } - bytes[0] = 'a'; - } - return new String(bytes, StandardCharsets.UTF_8); - }); - } - @FunctionalInterface interface Read { T read(VarByteChunkForwardIndexReaderV5 reader, VarByteChunkForwardIndexReaderV5.ReaderContext context, int docId); From 9bb453a9af9fd1c9b88733a0ebf182f2e82ef3ff Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 22:29:40 +0800 Subject: [PATCH 046/101] Refactored unit test --- .../index/creator/VarByteChunkV4Test.java | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java index 
3b83a793fcb0..6c8eb6d9fe89 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java @@ -33,7 +33,6 @@ import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; @@ -89,22 +88,6 @@ public void deleteDirs() { } } - public VarByteChunkForwardIndexWriterV5 writerSupplierV5(File file, ChunkCompressionType compressionType, - int chunkSize) - throws IOException { - return new VarByteChunkForwardIndexWriterV5(file, compressionType, chunkSize); - } - - public VarByteChunkForwardIndexWriterV4 writerSupplierV4(File file, ChunkCompressionType compressionType, - int chunkSize) - throws IOException { - return new VarByteChunkForwardIndexWriterV4(file, compressionType, chunkSize); - } - - public VarByteChunkForwardIndexReaderV4 readerSupplierV4(PinotDataBuffer buffer, FieldSpec.DataType dataType, boolean isSingleValue) { - return new VarByteChunkForwardIndexReaderV4(buffer, dataType, isSingleValue); - } - @Test(dataProvider = "params") public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) throws IOException, RuntimeException { From 93d31004d8acbf27192f3416c12c2f67e6955a22 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 23:35:40 +0800 Subject: [PATCH 047/101] Add blank line --- .../segment/local/segment/index/creator/VarByteChunkV5Test.java | 1 + 1 file changed, 1 insertion(+) diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index c5bbc75c2760..f80f764fe462 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. */ + package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; From 79e91e9c6b08410ecf99157925b98e924b6145d2 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 23:35:59 +0800 Subject: [PATCH 048/101] Remove blank line --- .../segment/local/segment/index/creator/VarByteChunkV5Test.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index f80f764fe462..c5bbc75c2760 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; From 9a7676f47079bf6b56858d7c8eb99c60cc554b6f Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 10 Oct 2024 22:42:48 +0800 Subject: [PATCH 049/101] Add blank line --- .../pinot/integration/tests/UpsertTableIntegrationTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java index 30b15c0c1a6e..f7404dc64bd8 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java @@ -71,6 +71,7 @@ * - Dimension fields: playerId:int (primary key), name:string, game:string, deleted:boolean * - Metric fields: score:float * - DataTime fields: timestampInEpoch:long + * */ public class UpsertTableIntegrationTest extends BaseClusterIntegrationTestSet { private static final String INPUT_DATA_SMALL_TAR_FILE = "gameScores_csv.tar.gz"; From aa9eb7455fd305b9d8f7a85b7969fe66c911984a Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 10 Oct 2024 22:43:01 +0800 Subject: [PATCH 050/101] Remove blank line --- .../pinot/integration/tests/UpsertTableIntegrationTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java index f7404dc64bd8..30b15c0c1a6e 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java @@ -71,7 +71,6 @@ * - Dimension fields: playerId:int 
(primary key), name:string, game:string, deleted:boolean * - Metric fields: score:float * - DataTime fields: timestampInEpoch:long - * */ public class UpsertTableIntegrationTest extends BaseClusterIntegrationTestSet { private static final String INPUT_DATA_SMALL_TAR_FILE = "gameScores_csv.tar.gz"; From 4ce5280320b89fab71442675d37ff7742c952fad Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Tue, 15 Oct 2024 05:18:27 +0800 Subject: [PATCH 051/101] Refactored code to utilize changes from Extract common MV ser/de logic into ArraySerDeUtils #14209 --- .../VarByteChunkForwardIndexWriterV5.java | 21 ++++++++++ .../MultiValueFixedByteRawIndexCreator.java | 22 +++++++--- .../forward/ForwardIndexReaderFactory.java | 5 +-- .../FixedByteChunkMVForwardIndexReader.java | 5 --- .../VarByteChunkForwardIndexReaderV5.java | 41 +++++++++++++++++-- 5 files changed, 77 insertions(+), 17 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java index 72c94e210139..968891089b8f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java @@ -21,6 +21,7 @@ import java.io.File; import java.io.IOException; import javax.annotation.concurrent.NotThreadSafe; +import org.apache.pinot.segment.local.utils.ArraySerDeUtils; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; @@ -52,4 +53,24 @@ public VarByteChunkForwardIndexWriterV5(File file, ChunkCompressionType compress public int getVersion() { return VERSION; } + + @Override + public void putIntMV(int[] values) { + putBytes(ArraySerDeUtils.serializeIntArrayWithoutLength(values)); + } + + @Override + public void putLongMV(long[] values) 
{ + putBytes(ArraySerDeUtils.serializeLongArrayWithoutLength(values)); + } + + @Override + public void putFloatMV(float[] values) { + putBytes(ArraySerDeUtils.serializeFloatArrayWithoutLength(values)); + } + + @Override + public void putDoubleMV(double[] values) { + putBytes(ArraySerDeUtils.serializeDoubleArrayWithoutLength(values)); + } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index 4fafcbcaad5e..b8a6bd6daafd 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriter; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkWriter; import org.apache.pinot.segment.spi.V1Constants.Indexes; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; @@ -69,23 +70,32 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType c ForwardIndexConfig.DEFAULT_TARGET_DOCS_PER_CHUNK); } - public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) throws IOException { - // Store the length followed by the values - int totalMaxLength = Integer.BYTES + (maxNumberOfMultiValueElements * 
valueType.getStoredType().size()); if (writerVersion < VarByteChunkForwardIndexWriterV4.VERSION) { + // Store the length followed by the values + int totalMaxLength = Integer.BYTES + (maxNumberOfMultiValueElements * valueType.getStoredType().size()); int numDocsPerChunk = deriveNumDocsPerChunk ? Math.max(targetMaxChunkSizeBytes / (totalMaxLength + VarByteChunkForwardIndexWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE), 1) : targetDocsPerChunk; _indexWriter = new VarByteChunkForwardIndexWriter(indexFile, compressionType, totalDocs, numDocsPerChunk, totalMaxLength, writerVersion); } else { - int chunkSize = - ForwardIndexUtils.getDynamicTargetChunkSize(totalMaxLength, targetDocsPerChunk, targetMaxChunkSizeBytes); - _indexWriter = new VarByteChunkForwardIndexWriterV4(indexFile, compressionType, chunkSize); + if (writerVersion == VarByteChunkForwardIndexWriterV5.VERSION) { + // Store only the values + int totalMaxLength = maxNumberOfMultiValueElements * valueType.getStoredType().size(); + int chunkSize = + ForwardIndexUtils.getDynamicTargetChunkSize(totalMaxLength, targetDocsPerChunk, targetMaxChunkSizeBytes); + _indexWriter = new VarByteChunkForwardIndexWriterV5(indexFile, compressionType, chunkSize); + } else { + // Store the length followed by the values + int totalMaxLength = Integer.BYTES + (maxNumberOfMultiValueElements * valueType.getStoredType().size()); + int chunkSize = + ForwardIndexUtils.getDynamicTargetChunkSize(totalMaxLength, targetDocsPerChunk, targetMaxChunkSizeBytes); + _indexWriter = new VarByteChunkForwardIndexWriterV4(indexFile, compressionType, chunkSize); + } } _valueType = valueType; } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java index fdf1cfd1b96a..cc7201ed985f 100644 --- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java @@ -108,11 +108,10 @@ public static ForwardIndexReader createRawIndexReader(PinotDataBuffer dataBuffer : new FixedByteChunkSVForwardIndexReader(dataBuffer, storedType); } - if (version >= VarByteChunkForwardIndexWriterV5.VERSION) { + if (version == VarByteChunkForwardIndexWriterV5.VERSION) { // V5 is the same as V4 except the multi-value docs have implicit value count rather than explicit return new VarByteChunkForwardIndexReaderV5(dataBuffer, storedType, isSingleValue); - } - if (version == VarByteChunkForwardIndexWriterV4.VERSION) { + } else if (version == VarByteChunkForwardIndexWriterV4.VERSION) { // V4 reader is common for sv var byte, mv fixed byte and mv var byte return new VarByteChunkForwardIndexReaderV4(dataBuffer, storedType, isSingleValue); } else { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java index 96299ffb32c0..8e53ecb15639 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java @@ -42,11 +42,6 @@ public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType s _maxChunkSize = _numDocsPerChunk * (ROW_OFFSET_SIZE + _lengthOfLongestEntry); } - public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType storedType, boolean explicitMVEntrySize) { - this(dataBuffer, storedType); - _explicitMVEntrySize = 
explicitMVEntrySize; - } - @Nullable @Override public ChunkReaderContext createContext() { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java index fd4528c27df7..3221479c9f91 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -19,9 +19,9 @@ package org.apache.pinot.segment.local.segment.index.readers.forward; import com.google.common.base.Preconditions; -import java.nio.ByteBuffer; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; +import org.apache.pinot.segment.local.utils.ArraySerDeUtils; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; @@ -44,7 +44,42 @@ public void validateIndexVersion(PinotDataBuffer dataBuffer) { } @Override - protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { - return byteBuffer.remaining() / _storedType.size(); + public int getIntMV(int docId, int[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeIntArrayWithoutLength(context.getValue(docId), valueBuffer); + } + + @Override + public int[] getIntMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeIntArrayWithoutLength(context.getValue(docId)); + } + + @Override + public int getLongMV(int docId, long[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return 
ArraySerDeUtils.deserializeLongArrayWithoutLength(context.getValue(docId), valueBuffer); + } + + @Override + public long[] getLongMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeLongArrayWithoutLength(context.getValue(docId)); + } + + @Override + public int getFloatMV(int docId, float[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeFloatArrayWithoutLength(context.getValue(docId), valueBuffer); + } + + @Override + public float[] getFloatMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeFloatArrayWithoutLength(context.getValue(docId)); + } + + @Override + public int getDoubleMV(int docId, double[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeDoubleArrayWithoutLength(context.getValue(docId), valueBuffer); + } + + @Override + public double[] getDoubleMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeDoubleArrayWithoutLength(context.getValue(docId)); } } From ec1b6283b1c383755f851259aedddca40ced4ce3 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Tue, 15 Oct 2024 05:50:31 +0800 Subject: [PATCH 052/101] Removed redundant RuntimeException from method signature --- .../segment/local/segment/index/creator/VarByteChunkV4Test.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java index 6c8eb6d9fe89..8387c93dc838 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java 
@@ -90,7 +90,7 @@ public void deleteDirs() { @Test(dataProvider = "params") public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) - throws IOException, RuntimeException { + throws IOException { File stringSVFile = new File(file, "testStringSV"); testWriteRead(stringSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, x -> x, VarByteChunkForwardIndexWriterV4::putString, (reader, context, docId) -> reader.getString(docId, context)); From e7f645c2880ad082a90972ae35266486f9e58532 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Sat, 28 Sep 2024 04:52:16 +0800 Subject: [PATCH 053/101] Rebase with lastest master --- ...ultiValueFixedByteRawIndexCreatorTest.java | 118 ++++++++++-------- 1 file changed, 65 insertions(+), 53 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 33c8525a42d2..affbfd0b10ed 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -54,6 +54,8 @@ public class MultiValueFixedByteRawIndexCreatorTest { private static final Random RANDOM = new Random(); + private static final boolean[] EXPLICIT_MV_ENTRY_SIZE_OPTIONS = {true, false}; + @DataProvider(name = "compressionTypes") public Object[][] compressionTypes() { return Arrays.stream(ChunkCompressionType.values()) @@ -78,77 +80,86 @@ public void cleanup() { @Test(dataProvider = "compressionTypes") public void testMVInt(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.INT, 
ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.INT, ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } @Test(dataProvider = "compressionTypes") public void testMVLong(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // 
This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } @Test(dataProvider = "compressionTypes") public void testMVFloat(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, 
docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } @Test(dataProvider = "compressionTypes") public void testMVDouble(ChunkCompressionType compressionType, int writerVersion) throws IOException { - // This tests varying lengths of MV rows - testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion); + for (boolean explicitMVEntrySizeOption : 
EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { + // This tests varying lengths of MV rows + testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion, explicitMVEntrySizeOption); + } } public void testMV(DataType dataType, List inputs, ToIntFunction sizeof, IntFunction constructor, - Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion) + Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion, + boolean explicitMVEntrySize) throws IOException { String column = "testCol_" + dataType; int numDocs = inputs.size(); @@ -158,6 +169,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, maxElements, false, writerVersion, 1024 * 1024, 1000); + creator.setExplicitMVEntrySize(explicitMVEntrySize); inputs.forEach(input -> injector.inject(creator, input)); creator.close(); From 5cb4575bc4b22dc6a4e052a8b4adde2eea8bcae5 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 05:24:41 +0800 Subject: [PATCH 054/101] Fixed uncovered code paths exposed via unit test --- .../segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java | 2 +- 
.../impl/fwd/MultiValueFixedByteRawIndexCreator.java | 1 + .../forward/FixedByteChunkMVForwardIndexReader.java | 5 +++++ .../creator/MultiValueFixedByteRawIndexCreatorTest.java | 7 ++++--- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java index c681265ffb9f..e15bb49d0ee5 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java @@ -120,7 +120,7 @@ public CLPForwardIndexCreatorV1(File baseIndexDir, String column, int numDocs, C _encodedVarsFwdIndexFile = new File(_intermediateFilesDir, column + "_clp_encodedvars.fwd"); _encodedVarsFwdIndexWriter = new MultiValueFixedByteRawIndexCreator(_encodedVarsFwdIndexFile, ChunkCompressionType.LZ4, numDocs, - FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, + FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, true, VarByteChunkForwardIndexWriterV4.VERSION); _clpStats.clear(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index e7877f8b6064..4fafcbcaad5e 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -69,6 +69,7 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, 
ChunkCompressionType c ForwardIndexConfig.DEFAULT_TARGET_DOCS_PER_CHUNK); } + public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java index 8e53ecb15639..96299ffb32c0 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java @@ -42,6 +42,11 @@ public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType s _maxChunkSize = _numDocsPerChunk * (ROW_OFFSET_SIZE + _lengthOfLongestEntry); } + public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType storedType, boolean explicitMVEntrySize) { + this(dataBuffer, storedType); + _explicitMVEntrySize = explicitMVEntrySize; + } + @Nullable @Override public ChunkReaderContext createContext() { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index affbfd0b10ed..2a5567efa464 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ 
-20,6 +20,7 @@ import java.io.File; import java.io.IOException; +import java.nio.BufferUnderflowException; import java.nio.ByteOrder; import java.util.ArrayList; import java.util.Arrays; @@ -168,8 +169,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo file.delete(); MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, - maxElements, false, writerVersion, 1024 * 1024, 1000); - creator.setExplicitMVEntrySize(explicitMVEntrySize); + maxElements, false, explicitMVEntrySize, writerVersion, 1024 * 1024, 1000); inputs.forEach(input -> injector.inject(creator, input)); creator.close(); @@ -177,7 +177,8 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo final PinotDataBuffer buffer = PinotDataBuffer.mapFile(file, true, 0, file.length(), ByteOrder.BIG_ENDIAN, ""); ForwardIndexReader reader = writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + dataType.getStoredType(), false, explicitMVEntrySize) + : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType(), explicitMVEntrySize); final ForwardIndexReaderContext context = reader.createContext(); T valueBuffer = constructor.apply(maxElements); From 1ece331f71cd324619caf3aa4d13f52a08c5701e Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 05:31:50 +0800 Subject: [PATCH 055/101] Fix style issue --- .../index/creator/MultiValueFixedByteRawIndexCreatorTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 2a5567efa464..b32281f9302c 
100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -20,7 +20,6 @@ import java.io.File; import java.io.IOException; -import java.nio.BufferUnderflowException; import java.nio.ByteOrder; import java.util.ArrayList; import java.util.Arrays; From 4c35683d0ec542dcf660b1a7da14d546a570aed1 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:21:34 +0800 Subject: [PATCH 056/101] Refactored code to use new class versions. --- .../impl/fwd/CLPForwardIndexCreatorV1.java | 2 +- .../MultiValueFixedByteRawIndexCreatorV2.java | 116 ++++++++++++++++ .../FixedByteChunkMVForwardIndexReaderV2.java | 45 +++++++ .../VarByteChunkForwardIndexReaderV5.java | 47 +++++++ ...ultiValueFixedByteRawIndexCreatorTest.java | 124 ++++++++---------- ...tiValueFixedByteRawIndexCreatorV2Test.java | 52 ++++++++ 6 files changed, 318 insertions(+), 68 deletions(-) create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java create mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java index e15bb49d0ee5..c681265ffb9f 100644 --- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java @@ -120,7 +120,7 @@ public CLPForwardIndexCreatorV1(File baseIndexDir, String column, int numDocs, C _encodedVarsFwdIndexFile = new File(_intermediateFilesDir, column + "_clp_encodedvars.fwd"); _encodedVarsFwdIndexWriter = new MultiValueFixedByteRawIndexCreator(_encodedVarsFwdIndexFile, ChunkCompressionType.LZ4, numDocs, - FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, true, + FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, VarByteChunkForwardIndexWriterV4.VERSION); _clpStats.clear(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java new file mode 100644 index 000000000000..90bc9281d23a --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.segment.creator.impl.fwd; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + Same as MultiValueFixedByteRawIndexCreator, but without storing the number of elements for each row. + */ +public class MultiValueFixedByteRawIndexCreatorV2 extends MultiValueFixedByteRawIndexCreator { + /** + * Create a var-byte raw index creator for the given column + * + * @param baseIndexDir Index directory + * @param compressionType Type of compression to use + * @param column Name of column to index + * @param totalDocs Total number of documents to index + * @param valueType Type of the values + * @param deriveNumDocsPerChunk true if writer should auto-derive the number of rows per chunk + * @param writerVersion writer format version + * @param targetMaxChunkSizeBytes target max chunk size in bytes, applicable only for V4 or when + * deriveNumDocsPerChunk is true + */ + public MultiValueFixedByteRawIndexCreatorV2(File baseIndexDir, ChunkCompressionType compressionType, String column, + int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, + int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) + throws IOException { + super(baseIndexDir, compressionType, column, totalDocs, valueType, maxNumberOfMultiValueElements, + deriveNumDocsPerChunk, writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); + } + + public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion) + throws IOException { + super(indexFile, compressionType, totalDocs, valueType, 
maxNumberOfMultiValueElements, deriveNumDocsPerChunk, + writerVersion); + } + + public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, + int targetMaxChunkSizeBytes, int targetDocsPerChunk) + throws IOException { + super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, + writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); + } + + @Override + protected int computeTotalMaxLength(int maxNumberOfMultiValueElements, DataType valueType) { + return maxNumberOfMultiValueElements * valueType.getStoredType().size(); + } + + @Override + public void putIntMV(int[] values) { + byte[] bytes = new byte[values.length * Integer.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (int value : values) { + byteBuffer.putInt(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putLongMV(long[] values) { + byte[] bytes = new byte[values.length * Long.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (long value : values) { + byteBuffer.putLong(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putFloatMV(float[] values) { + byte[] bytes = new byte[values.length * Float.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (float value : values) { + byteBuffer.putFloat(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putDoubleMV(double[] values) { + byte[] bytes = new byte[values.length * Double.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (double value : values) { + byteBuffer.putDouble(value); + } + _indexWriter.putBytes(bytes); + } +} diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java new file mode 100644 index 000000000000..762672928d72 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.index.readers.forward; + +import java.nio.ByteBuffer; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + Same as FixedByteChunkMVForwardIndexReader, but the number of elements for each row is inferred + */ +public final class FixedByteChunkMVForwardIndexReaderV2 extends FixedByteChunkMVForwardIndexReader { + + public FixedByteChunkMVForwardIndexReaderV2(PinotDataBuffer dataBuffer, DataType storedType) { + super(dataBuffer, storedType); + } + + @Override + public int getNumValuesMV(int docId, ChunkReaderContext context) { + ByteBuffer byteBuffer = slice(docId, context); + return getNumValuesMV(byteBuffer); + } + + @Override + protected int getNumValuesMV(ByteBuffer byteBuffer) { + return byteBuffer.remaining() / _storedType.size(); + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java new file mode 100644 index 000000000000..0a8a4527bebc --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.segment.index.readers.forward; + +import java.nio.ByteBuffer; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec; + + +/** + * Chunk-based raw (non-dictionary-encoded) forward index reader for values of SV variable length data types + * (BIG_DECIMAL, STRING, BYTES), MV fixed length and MV variable length data types. + *

For data layout, please refer to the documentation for {@link VarByteChunkForwardIndexWriterV4} + */ +public class VarByteChunkForwardIndexReaderV5 extends VarByteChunkForwardIndexReaderV4 { + public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, + boolean isSingleValue) { + super(dataBuffer, storedType, isSingleValue); + } + + public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, + boolean isSingleValue, boolean explicitMVEntrySize) { + super(dataBuffer, storedType, isSingleValue, explicitMVEntrySize); + } + + @Override + protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { + return byteBuffer.remaining() / _storedType.size(); + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index b32281f9302c..c17c77a8a874 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -54,8 +54,6 @@ public class MultiValueFixedByteRawIndexCreatorTest { private static final Random RANDOM = new Random(); - private static final boolean[] EXPLICIT_MV_ENTRY_SIZE_OPTIONS = {true, false}; - @DataProvider(name = "compressionTypes") public Object[][] compressionTypes() { return Arrays.stream(ChunkCompressionType.values()) @@ -80,86 +78,78 @@ public void cleanup() { @Test(dataProvider = "compressionTypes") public void testMVInt(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.INT, 
ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, - (reader, context, docId, buffer) -> { - int length = reader.getIntMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.INT, ints(false), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.INT, ints(true), x -> x.length, int[]::new, MultiValueFixedByteRawIndexCreator::putIntMV, + (reader, context, docId, buffer) -> { + int length = reader.getIntMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } @Test(dataProvider = "compressionTypes") public void testMVLong(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, 
explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, - (reader, context, docId, buffer) -> { - int length = reader.getLongMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.LONG, longs(false), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.LONG, longs(true), x -> x.length, long[]::new, MultiValueFixedByteRawIndexCreator::putLongMV, + (reader, context, docId, buffer) -> { + int length = reader.getLongMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } @Test(dataProvider = "compressionTypes") public void testMVFloat(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, 
MultiValueFixedByteRawIndexCreator::putFloatMV, - (reader, context, docId, buffer) -> { - int length = reader.getFloatMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.FLOAT, floats(false), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.FLOAT, floats(true), x -> x.length, float[]::new, MultiValueFixedByteRawIndexCreator::putFloatMV, + (reader, context, docId, buffer) -> { + int length = reader.getFloatMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } @Test(dataProvider = "compressionTypes") public void testMVDouble(ChunkCompressionType compressionType, int writerVersion) throws IOException { - for (boolean explicitMVEntrySizeOption : EXPLICIT_MV_ENTRY_SIZE_OPTIONS) { - // This tests varying lengths of MV rows - testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, writerVersion, explicitMVEntrySizeOption); - - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk - testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, - MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { - int length = reader.getDoubleMV(docId, buffer, context); - return Arrays.copyOf(buffer, length); - }, compressionType, 
writerVersion, explicitMVEntrySizeOption); - } + // This tests varying lengths of MV rows + testMV(DataType.DOUBLE, doubles(false), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); + + + // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk + testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, + MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { + int length = reader.getDoubleMV(docId, buffer, context); + return Arrays.copyOf(buffer, length); + }, compressionType, writerVersion); } public void testMV(DataType dataType, List inputs, ToIntFunction sizeof, IntFunction constructor, - Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion, - boolean explicitMVEntrySize) + Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion) throws IOException { String column = "testCol_" + dataType; int numDocs = inputs.size(); @@ -168,7 +158,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo file.delete(); MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, - maxElements, false, explicitMVEntrySize, writerVersion, 1024 * 1024, 1000); + maxElements, false, writerVersion, 1024 * 1024, 1000); inputs.forEach(input -> injector.inject(creator, input)); creator.close(); @@ -176,8 +166,8 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo final PinotDataBuffer buffer = PinotDataBuffer.mapFile(file, true, 0, file.length(), ByteOrder.BIG_ENDIAN, ""); ForwardIndexReader reader = writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false, explicitMVEntrySize) - : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType(), explicitMVEntrySize); + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); final ForwardIndexReaderContext context = reader.createContext(); T valueBuffer = constructor.apply(maxElements); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java new file mode 100644 index 000000000000..e4937a12ab3f --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.index.creator; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.function.IntFunction; +import java.util.function.ToIntFunction; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReader; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; +import org.apache.pinot.segment.spi.V1Constants.Indexes; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + + +public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { + // TODO: +} From bd1da13ef2bb931cb1d52648078f22392957fa14 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:25:29 +0800 Subject: [PATCH 057/101] Fixed style. 
--- ...tiValueFixedByteRawIndexCreatorV2Test.java | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index e4937a12ab3f..8a3e2e4fc65e 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,35 +18,6 @@ */ package org.apache.pinot.segment.local.segment.index.creator; -import java.io.File; -import java.io.IOException; -import java.nio.ByteOrder; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; -import java.util.function.IntFunction; -import java.util.function.ToIntFunction; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.commons.io.FileUtils; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; -import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReader; -import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; -import org.apache.pinot.segment.spi.V1Constants.Indexes; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec.DataType; -import org.testng.Assert; -import 
org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - - public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { // TODO: } From 2637e2def1c4bc925fe9b24d22da34743a3e32ce Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:32:21 +0800 Subject: [PATCH 058/101] Refactored MultiValueFixedByteRawIndexCreatorTest.java --- ...MultiValueFixedByteRawIndexCreatorTest.java | 11 +++++++---- ...ltiValueFixedByteRawIndexCreatorV2Test.java | 18 +++++++++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index c17c77a8a874..78d72b49e452 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -164,10 +164,7 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo //read final PinotDataBuffer buffer = PinotDataBuffer.mapFile(file, true, 0, file.length(), ByteOrder.BIG_ENDIAN, ""); - ForwardIndexReader reader = - writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + ForwardIndexReader reader = getForwardIndexReader(buffer, dataType, writerVersion); final ForwardIndexReaderContext context = reader.createContext(); T valueBuffer = constructor.apply(maxElements); @@ -189,6 +186,12 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo } } + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + } + interface Extractor { T extract(ForwardIndexReader reader, ForwardIndexReaderContext context, int offset, T buffer); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index 8a3e2e4fc65e..1c90b8092f6a 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,6 +18,22 @@ */ package org.apache.pinot.segment.local.segment.index.creator; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; +import 
org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec; + + +/** + Same as MultiValueFixedByteRawIndexCreatorTest, but the forward index creator and reader are newer version + */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { - // TODO: + @Override + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV5(buffer, + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); + } } From 731906e470693b3b25087fe4292c3271c447903b Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:36:46 +0800 Subject: [PATCH 059/101] Fix style. --- .../creator/MultiValueFixedByteRawIndexCreatorV2Test.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index 1c90b8092f6a..c07257901009 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -31,9 +31,9 @@ */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { @Override - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, + int writerVersion) { return writerVersion 
== VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV5(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); + dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); } } From 171aaf4ae926f6e05929c5c45de408cf10b0815a Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 14:25:36 +0800 Subject: [PATCH 060/101] Modified existing unit test and extended it for MultiValueFixedByteRawIndexCreatorV2Test.java --- ...ultiValueFixedByteRawIndexCreatorTest.java | 31 +++++++++++-------- ...tiValueFixedByteRawIndexCreatorV2Test.java | 27 ++++++++++++++-- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 78d72b49e452..8be27bf0fe09 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -49,8 +49,7 @@ public class MultiValueFixedByteRawIndexCreatorTest { - private static final String OUTPUT_DIR = - System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawTest"; + protected static String _outputDir; private static final Random RANDOM = new Random(); @@ -64,7 +63,8 @@ public Object[][] compressionTypes() { @BeforeClass public void setup() throws Exception { - FileUtils.forceMkdir(new File(OUTPUT_DIR)); + _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawTest"; + FileUtils.forceMkdir(new File(_outputDir)); } /** @@ -72,7 +72,7 @@ public void setup() */ @AfterClass public void 
cleanup() { - FileUtils.deleteQuietly(new File(OUTPUT_DIR)); + FileUtils.deleteQuietly(new File(_outputDir)); } @Test(dataProvider = "compressionTypes") @@ -148,17 +148,28 @@ public void testMVDouble(ChunkCompressionType compressionType, int writerVersion }, compressionType, writerVersion); } + public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator(ChunkCompressionType compressionType, + String column, int numDocs, DataType dataType, int maxElements, int writerVersion) + throws IOException { + return new MultiValueFixedByteRawIndexCreator(new File(_outputDir), compressionType, column, numDocs, dataType, + maxElements, false, writerVersion, 1024 * 1024, 1000); + } + + public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, + dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + } + public void testMV(DataType dataType, List inputs, ToIntFunction sizeof, IntFunction constructor, Injector injector, Extractor extractor, ChunkCompressionType compressionType, int writerVersion) throws IOException { String column = "testCol_" + dataType; int numDocs = inputs.size(); int maxElements = inputs.stream().mapToInt(sizeof).max().orElseThrow(RuntimeException::new); - File file = new File(OUTPUT_DIR, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION); + File file = new File(_outputDir, column + Indexes.RAW_MV_FORWARD_INDEX_FILE_EXTENSION); file.delete(); MultiValueFixedByteRawIndexCreator creator = - new MultiValueFixedByteRawIndexCreator(new File(OUTPUT_DIR), compressionType, column, numDocs, dataType, - maxElements, false, writerVersion, 1024 * 1024, 1000); + getMultiValueFixedByteRawIndexCreator(compressionType, column, numDocs, dataType, maxElements, writerVersion); inputs.forEach(input -> injector.inject(creator, input)); 
creator.close(); @@ -186,12 +197,6 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo } } - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { - return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); - } - interface Extractor { T extract(ForwardIndexReader reader, ForwardIndexReaderContext context, int offset, T buffer); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index c07257901009..eae997014e22 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,20 +18,43 @@ */ package org.apache.pinot.segment.local.segment.index.creator; +import java.io.File; +import java.io.IOException; +import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreatorV2; import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; 
import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; +import org.testng.annotations.BeforeClass; /** - Same as MultiValueFixedByteRawIndexCreatorTest, but the forward index creator and reader are newer version + Same as MultiValueFixedByteRawIndexCreatorTest, but newer version of forward index creator and reader are used */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { + @BeforeClass + public void setup() + throws Exception { + _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawV2Test"; + FileUtils.forceMkdir(new File(_outputDir)); + } + + @Override + public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator( + ChunkCompressionType compressionType, String column, int numDocs, FieldSpec.DataType dataType, int maxElements, + int writerVersion) + throws IOException { + return new MultiValueFixedByteRawIndexCreatorV2(new File(_outputDir), compressionType, column, numDocs, dataType, + maxElements, false, writerVersion, 1024 * 1024, 1000); + } + @Override - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, + public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV5(buffer, dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); From bca5edaa76635880154e81b0fd2d041d6ac75aff Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 18:15:33 +0800 Subject: [PATCH 061/101] Improved unit test for MultiValueFixedByteRawIndexCreatorTest and MultiValueFixedByteRawIndexCreatorV2Test --- ...ultiValueFixedByteRawIndexCreatorTest.java | 2 +- ...tiValueFixedByteRawIndexCreatorV2Test.java | 63 ++++++++++++++++++- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 8be27bf0fe09..1b4e4e9368a1 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -51,7 +51,7 @@ public class MultiValueFixedByteRawIndexCreatorTest { protected static String _outputDir; - private static final Random RANDOM = new Random(); + protected static final Random RANDOM = new Random(); @DataProvider(name = "compressionTypes") public Object[][] compressionTypes() { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index eae997014e22..dd3d0b1156b8 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -20,6 +20,8 @@ import java.io.File; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; @@ -30,11 +32,15 @@ import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; +import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; /** - Same as MultiValueFixedByteRawIndexCreatorTest, but newer version of forward index creator and reader are used + Similar to MultiValueFixedByteRawIndexCreatorTest, but utilizes the newer version of the forward index creator and + reader. Additionally, this test class includes a validation test for checking the compression ratio improvement with + the forward index creator version upgrade. */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { @BeforeClass @@ -59,4 +65,59 @@ public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpe return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV5(buffer, dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); } + + @Test + public void validateCompressionRatioIncrease() + throws IOException { + // Generate input data containing short MV docs with somewhat repetitive data + int numDocs = 1000000; + int numElements = 0; + int maxMVRowSize = 0; + List inputData = new ArrayList<>(numDocs); + for (int i = 0; i < numDocs; i++) { + long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; + maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); + numElements += mvRow.length; + for (int j = 0; j < mvRow.length; j++, numElements++) { + mvRow[j] = numElements % 10; + } + inputData.add(mvRow); + } + + for (int writerVersion : List.of(2, 4)) { + // Generate MV fixed byte raw fwd index with explicit length + File explicitLengthFwdIndexFile = new File(_outputDir, MultiValueFixedByteRawIndexCreator.class.getSimpleName()); + FileUtils.deleteQuietly(explicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator( + explicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, + true, writerVersion)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // Generate MV fixed byte raw fwd index with implicit length + File implicitLengthFwdIndexFile = + new File(_outputDir, MultiValueFixedByteRawIndexCreatorV2.class.getSimpleName()); + FileUtils.deleteQuietly(implicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreatorV2 creator = new MultiValueFixedByteRawIndexCreatorV2( + implicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, + true, writerVersion)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: + // 1. 
At least 15% larger than the implicit length variant when using Writer Version 2 + // 2. At least 200% larger than the implicit length variant when using Writer Version 4 + long expectedImplicitLengthFwdIndexMaxSize; + if (writerVersion == 2) { + expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); + } else { + expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); + } + Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); + } + } } From ce5eb7b559c56ae7c47e9c182bafcb8bb038f69d Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 23:17:42 +0800 Subject: [PATCH 062/101] Remove redundant blank line --- .../index/creator/MultiValueFixedByteRawIndexCreatorTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 1b4e4e9368a1..9a2105726aa7 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -139,7 +139,6 @@ public void testMVDouble(ChunkCompressionType compressionType, int writerVersion return Arrays.copyOf(buffer, length); }, compressionType, writerVersion); - // This tests a fixed length of MV rows to ensure there are no BufferOverflowExceptions on filling up the chunk testMV(DataType.DOUBLE, doubles(true), x -> x.length, double[]::new, MultiValueFixedByteRawIndexCreator::putDoubleMV, (reader, context, docId, buffer) -> { From 7274f4ce0589e741147235157ad0e98b296afbe4 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 23:45:49 
+0800 Subject: [PATCH 063/101] Adjusted comments content --- .../creator/MultiValueFixedByteRawIndexCreatorV2Test.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index dd3d0b1156b8..f7105630c5e3 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -109,8 +109,8 @@ public void validateCompressionRatioIncrease() } // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: - // 1. At least 15% larger than the implicit length variant when using Writer Version 2 - // 2. At least 200% larger than the implicit length variant when using Writer Version 4 + // 1. At least 1.15x larger than the implicit length variant when using Writer Version 2 + // 2. At least 2x larger than the implicit length variant when using Writer Version 4 long expectedImplicitLengthFwdIndexMaxSize; if (writerVersion == 2) { expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); From ea29a135a9c3eba94ebf65e722c7193d136e82b5 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 3 Oct 2024 16:22:53 +0800 Subject: [PATCH 064/101] Removed redundant constructor missed during refactoring. 
--- .../readers/forward/VarByteChunkForwardIndexReaderV5.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java index 0a8a4527bebc..20569bf7ff4e 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -35,11 +35,6 @@ public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.Da super(dataBuffer, storedType, isSingleValue); } - public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, - boolean isSingleValue, boolean explicitMVEntrySize) { - super(dataBuffer, storedType, isSingleValue, explicitMVEntrySize); - } - @Override protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { return byteBuffer.remaining() / _storedType.size(); From 256d774512b887e1fe84d7eaf6f25125190d0795 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 05:51:53 +0800 Subject: [PATCH 065/101] Upgrade MVFixedByteRawIndex reader and writer from V4 to V5, retain forward index creator version. 
--- .../VarByteChunkForwardIndexWriterV4.java | 12 +- .../VarByteChunkForwardIndexWriterV5.java | 55 +++++++++ .../MultiValueFixedByteRawIndexCreatorV2.java | 116 ------------------ .../forward/ForwardIndexReaderFactory.java | 7 ++ .../VarByteChunkForwardIndexReaderV4.java | 8 +- .../VarByteChunkForwardIndexReaderV5.java | 8 ++ 6 files changed, 85 insertions(+), 121 deletions(-) create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java delete mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index e7bc30fc7027..b9f1f876396c 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -76,11 +76,13 @@ public class VarByteChunkForwardIndexWriterV4 implements VarByteChunkWriter { public static final int VERSION = 4; - private static final Logger LOGGER = LoggerFactory.getLogger(VarByteChunkForwardIndexWriterV4.class); + // Use the run-time concrete class to retrieve the logger + protected final Logger LOGGER = LoggerFactory.getLogger(this.getClass()); + private static final String DATA_BUFFER_SUFFIX = ".buf"; private final File _dataBuffer; - private final RandomAccessFile _output; + protected final RandomAccessFile _output; private final FileChannel _dataChannel; private final ByteBuffer _chunkBuffer; private final ByteBuffer _compressionBuffer; @@ -105,11 +107,15 @@ public VarByteChunkForwardIndexWriterV4(File file, ChunkCompressionType compress 
writeHeader(_chunkCompressor.compressionType(), chunkSize); } + public int getVersion() { + return VERSION; + } + private void writeHeader(ChunkCompressionType compressionType, int targetDecompressedChunkSize) throws IOException { // keep metadata BE for backwards compatibility // (e.g. the version needs to be read by a factory which assumes BE) - _output.writeInt(VERSION); + _output.writeInt(getVersion()); _output.writeInt(targetDecompressedChunkSize); _output.writeInt(compressionType.getValue()); // reserve a slot to write the data offset into diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java new file mode 100644 index 000000000000..72c94e210139 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.io.writer.impl; + +import java.io.File; +import java.io.IOException; +import javax.annotation.concurrent.NotThreadSafe; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; + + +/** + * Forward index writer that extends {@link VarByteChunkForwardIndexWriterV4} with the only difference being the + * version tag is now bumped from 4 to 5. + * + *

The {@code VERSION} tag is a {@code static final} class variable set to {@code 5}. Since static variables + * are shadowed in the child class thus associated with the class that defines them, care must be taken to ensure + * that the parent class can correctly observe the child class's {@code VERSION} value at runtime.

+ * + *

To achieve this, the {@code getVersion()} method is overridden to return the concrete subclass's + * {@code VERSION} value, ensuring that the correct version number is returned even when using a reference + * to the parent class.

+ * + * @see VarByteChunkForwardIndexWriterV4 + * @see VarByteChunkForwardIndexWriterV5#getVersion() + */ +@NotThreadSafe +public class VarByteChunkForwardIndexWriterV5 extends VarByteChunkForwardIndexWriterV4 { + public static final int VERSION = 5; + + public VarByteChunkForwardIndexWriterV5(File file, ChunkCompressionType compressionType, int chunkSize) + throws IOException { + super(file, compressionType, chunkSize); + } + + @Override + public int getVersion() { + return VERSION; + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java deleted file mode 100644 index 90bc9281d23a..000000000000 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.pinot.segment.local.segment.creator.impl.fwd; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.spi.data.FieldSpec.DataType; - - -/** - Same as MultiValueFixedByteRawIndexCreator, but without storing the number of elements for each row. - */ -public class MultiValueFixedByteRawIndexCreatorV2 extends MultiValueFixedByteRawIndexCreator { - /** - * Create a var-byte raw index creator for the given column - * - * @param baseIndexDir Index directory - * @param compressionType Type of compression to use - * @param column Name of column to index - * @param totalDocs Total number of documents to index - * @param valueType Type of the values - * @param deriveNumDocsPerChunk true if writer should auto-derive the number of rows per chunk - * @param writerVersion writer format version - * @param targetMaxChunkSizeBytes target max chunk size in bytes, applicable only for V4 or when - * deriveNumDocsPerChunk is true - */ - public MultiValueFixedByteRawIndexCreatorV2(File baseIndexDir, ChunkCompressionType compressionType, String column, - int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, - int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) - throws IOException { - super(baseIndexDir, compressionType, column, totalDocs, valueType, maxNumberOfMultiValueElements, - deriveNumDocsPerChunk, writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); - } - - public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion) - throws IOException { - super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, - writerVersion); - } - - public 
MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, - int targetMaxChunkSizeBytes, int targetDocsPerChunk) - throws IOException { - super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, - writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); - } - - @Override - protected int computeTotalMaxLength(int maxNumberOfMultiValueElements, DataType valueType) { - return maxNumberOfMultiValueElements * valueType.getStoredType().size(); - } - - @Override - public void putIntMV(int[] values) { - byte[] bytes = new byte[values.length * Integer.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (int value : values) { - byteBuffer.putInt(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putLongMV(long[] values) { - byte[] bytes = new byte[values.length * Long.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (long value : values) { - byteBuffer.putLong(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putFloatMV(float[] values) { - byte[] bytes = new byte[values.length * Float.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (float value : values) { - byteBuffer.putFloat(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putDoubleMV(double[] values) { - byte[] bytes = new byte[values.length * Double.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (double value : values) { - byteBuffer.putDouble(value); - } - _indexWriter.putBytes(bytes); - } -} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java index db815761d9ea..a4344b61f0de 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java @@ -19,8 +19,10 @@ package org.apache.pinot.segment.local.segment.index.forward; +import com.google.errorprone.annotations.Var; import java.util.Arrays; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.creator.impl.fwd.CLPForwardIndexCreatorV1; import org.apache.pinot.segment.local.segment.index.readers.forward.CLPForwardIndexReaderV1; import org.apache.pinot.segment.local.segment.index.readers.forward.FixedBitMVEntryDictForwardIndexReader; @@ -30,6 +32,7 @@ import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkSVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.forward.FixedBytePower2ChunkSVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkMVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkSVForwardIndexReader; import org.apache.pinot.segment.local.segment.index.readers.sorted.SortedIndexReaderImpl; @@ -106,6 +109,10 @@ public static ForwardIndexReader createRawIndexReader(PinotDataBuffer dataBuffer : new FixedByteChunkSVForwardIndexReader(dataBuffer, storedType); } + if (version >= VarByteChunkForwardIndexWriterV5.VERSION) { + // V5 is 
the same as V4 except the multi-value docs have implicit value count rather than explicit + return new VarByteChunkForwardIndexReaderV5(dataBuffer, storedType, isSingleValue); + } if (version == VarByteChunkForwardIndexWriterV4.VERSION) { // V4 reader is common for sv var byte, mv fixed byte and mv var byte return new VarByteChunkForwardIndexReaderV4(dataBuffer, storedType, isSingleValue); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java index 277805d22c95..b8b007ec8dd1 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java @@ -67,8 +67,7 @@ public class VarByteChunkForwardIndexReaderV4 public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.DataType storedType, boolean isSingleValue) { - int version = dataBuffer.getInt(0); - Preconditions.checkState(version == VarByteChunkForwardIndexWriterV4.VERSION, "Illegal index version: %s", version); + validateIndexVersion(dataBuffer); _storedType = storedType; _targetDecompressedChunkSize = dataBuffer.getInt(4); _chunkCompressionType = ChunkCompressionType.valueOf(dataBuffer.getInt(8)); @@ -81,6 +80,11 @@ public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.Da _isSingleValue = isSingleValue; } + public void validateIndexVersion(PinotDataBuffer dataBuffer) { + int version = dataBuffer.getInt(0); + Preconditions.checkState(version == VarByteChunkForwardIndexWriterV4.VERSION, "Illegal index version: %s", version); + } + @Override public boolean isDictionaryEncoded() { return false; diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java index 20569bf7ff4e..fd4528c27df7 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -18,8 +18,10 @@ */ package org.apache.pinot.segment.local.segment.index.readers.forward; +import com.google.common.base.Preconditions; import java.nio.ByteBuffer; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; @@ -35,6 +37,12 @@ public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.Da super(dataBuffer, storedType, isSingleValue); } + @Override + public void validateIndexVersion(PinotDataBuffer dataBuffer) { + int version = dataBuffer.getInt(0); + Preconditions.checkState(version == VarByteChunkForwardIndexWriterV5.VERSION, "Illegal index version: %s", version); + } + @Override protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { return byteBuffer.remaining() / _storedType.size(); From acfe86483567f04f6ceb6da1e1cd5bb3d646aeec Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 06:08:41 +0800 Subject: [PATCH 066/101] Fix minor style issue. 
--- .../io/writer/impl/VarByteChunkForwardIndexWriterV4.java | 4 ++-- .../segment/index/forward/ForwardIndexReaderFactory.java | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index b9f1f876396c..f94fd64cee75 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -77,7 +77,7 @@ public class VarByteChunkForwardIndexWriterV4 implements VarByteChunkWriter { public static final int VERSION = 4; // Use the run-time concrete class to retrieve the logger - protected final Logger LOGGER = LoggerFactory.getLogger(this.getClass()); + protected final Logger _logger = LoggerFactory.getLogger(this.getClass()); private static final String DATA_BUFFER_SUFFIX = ".buf"; @@ -276,7 +276,7 @@ private void write(ByteBuffer buffer, boolean huge) { _chunkOffset += compressedSize; _docIdOffset = _nextDocId; } catch (IOException e) { - LOGGER.error("Exception caught while compressing/writing data chunk", e); + _logger.error("Exception caught while compressing/writing data chunk", e); throw new RuntimeException(e); } finally { if (mapped != null) { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java index a4344b61f0de..fdf1cfd1b96a 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java @@ -19,7 +19,6 @@ package org.apache.pinot.segment.local.segment.index.forward; -import com.google.errorprone.annotations.Var; import java.util.Arrays; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; From a4751b632a95e6429a5c35d0f320450d1192e0ed Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 16:35:23 +0800 Subject: [PATCH 067/101] Deleted FixByteChunkMVForwardIndexReaderV2 --- .../FixedByteChunkMVForwardIndexReaderV2.java | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java deleted file mode 100644 index 762672928d72..000000000000 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.pinot.segment.local.segment.index.readers.forward; - -import java.nio.ByteBuffer; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec.DataType; - - -/** - Same as FixedByteChunkMVForwardIndexReader, but the number of elements for each row is inferred - */ -public final class FixedByteChunkMVForwardIndexReaderV2 extends FixedByteChunkMVForwardIndexReader { - - public FixedByteChunkMVForwardIndexReaderV2(PinotDataBuffer dataBuffer, DataType storedType) { - super(dataBuffer, storedType); - } - - @Override - public int getNumValuesMV(int docId, ChunkReaderContext context) { - ByteBuffer byteBuffer = slice(docId, context); - return getNumValuesMV(byteBuffer); - } - - @Override - protected int getNumValuesMV(ByteBuffer byteBuffer) { - return byteBuffer.remaining() / _storedType.size(); - } -} From 32062a11fecdc4634e5ae7a56693e500eb09674d Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 16:44:48 +0800 Subject: [PATCH 068/101] Deleted FixByteChunkMVForwardIndexReaderV2Test --- ...tiValueFixedByteRawIndexCreatorV2Test.java | 123 ------------------ 1 file changed, 123 deletions(-) delete mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java deleted file mode 100644 index f7105630c5e3..000000000000 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ /dev/null @@ -1,123 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.pinot.segment.local.segment.index.creator; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import org.apache.commons.io.FileUtils; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreatorV2; -import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; -import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - - -/** - Similar to MultiValueFixedByteRawIndexCreatorTest, but utilizes the newer version of the forward index creator and - reader. Additionally, this test class includes a validation test for checking the compression ratio improvement with - the forward index creator version upgrade. 
- */ -public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { - @BeforeClass - public void setup() - throws Exception { - _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawV2Test"; - FileUtils.forceMkdir(new File(_outputDir)); - } - - @Override - public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator( - ChunkCompressionType compressionType, String column, int numDocs, FieldSpec.DataType dataType, int maxElements, - int writerVersion) - throws IOException { - return new MultiValueFixedByteRawIndexCreatorV2(new File(_outputDir), compressionType, column, numDocs, dataType, - maxElements, false, writerVersion, 1024 * 1024, 1000); - } - - @Override - public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, - int writerVersion) { - return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV5(buffer, - dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); - } - - @Test - public void validateCompressionRatioIncrease() - throws IOException { - // Generate input data containing short MV docs with somewhat repetitive data - int numDocs = 1000000; - int numElements = 0; - int maxMVRowSize = 0; - List inputData = new ArrayList<>(numDocs); - for (int i = 0; i < numDocs; i++) { - long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; - maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); - numElements += mvRow.length; - for (int j = 0; j < mvRow.length; j++, numElements++) { - mvRow[j] = numElements % 10; - } - inputData.add(mvRow); - } - - for (int writerVersion : List.of(2, 4)) { - // Generate MV fixed byte raw fwd index with explicit length - File explicitLengthFwdIndexFile = new File(_outputDir, MultiValueFixedByteRawIndexCreator.class.getSimpleName()); - FileUtils.deleteQuietly(explicitLengthFwdIndexFile); - try 
(MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator( - explicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, - true, writerVersion)) { - for (long[] mvRow : inputData) { - creator.putLongMV(mvRow); - } - } - - // Generate MV fixed byte raw fwd index with implicit length - File implicitLengthFwdIndexFile = - new File(_outputDir, MultiValueFixedByteRawIndexCreatorV2.class.getSimpleName()); - FileUtils.deleteQuietly(implicitLengthFwdIndexFile); - try (MultiValueFixedByteRawIndexCreatorV2 creator = new MultiValueFixedByteRawIndexCreatorV2( - implicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, - true, writerVersion)) { - for (long[] mvRow : inputData) { - creator.putLongMV(mvRow); - } - } - - // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: - // 1. At least 1.15x larger than the implicit length variant when using Writer Version 2 - // 2. 
At least 2x larger than the implicit length variant when using Writer Version 4 - long expectedImplicitLengthFwdIndexMaxSize; - if (writerVersion == 2) { - expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); - } else { - expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); - } - Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); - } - } -} From ef3f663c214aa0e825643bbfd9882fc807ecde38 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 17:22:15 +0800 Subject: [PATCH 069/101] Add VarByteChunkV5Test unit test --- .../index/creator/VarByteChunkV5Test.java | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java new file mode 100644 index 000000000000..bcc71b94dbad --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -0,0 +1,227 @@ +package org.apache.pinot.segment.local.segment.index.creator; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.BiConsumer; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; +import 
org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec; +import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; + + +public class VarByteChunkV5Test extends VarByteChunkV4Test { + private static final Random RANDOM = new Random(); + private static File[] _dirs; + + @DataProvider(parallel = true) + public Object[][] params() { + Object[][] params = new Object[][]{ + {null, ChunkCompressionType.LZ4, 20, 1024}, {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 20, 1024}, {null + , ChunkCompressionType.PASS_THROUGH, 20, 1024}, {null, ChunkCompressionType.SNAPPY, 20, 1024}, {null, + ChunkCompressionType.ZSTANDARD, 20, 1024}, {null, ChunkCompressionType.LZ4, 2048, 1024}, {null, + ChunkCompressionType.LZ4_LENGTH_PREFIXED, 2048, 1024}, {null, ChunkCompressionType.PASS_THROUGH, 2048, 1024}, + {null, ChunkCompressionType.SNAPPY, 2048, 1024}, {null, ChunkCompressionType.ZSTANDARD, 2048, 1024} + }; + + for (int i = 0; i < _dirs.length; i++) { + params[i][0] = _dirs[i]; + } + + return params; + } + + @BeforeClass + public void forceMkDirs() + throws IOException { + _dirs = new File[10]; + for (int i = 0; i < _dirs.length; i++) { + _dirs[i] = new File(new File(FileUtils.getTempDirectory(), UUID.randomUUID().toString()), "VarByteChunkV5Test"); + FileUtils.forceMkdir(_dirs[i]); + } + } + + @AfterClass + public void deleteDirs() { + for (File dir : _dirs) { + FileUtils.deleteQuietly(dir); + } + } + + @Test(dataProvider = "params") + public void testStringSV(File file, 
ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File stringSVFile = new File(file, "testStringSV"); + testWriteRead(stringSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, x -> x, + VarByteChunkForwardIndexWriterV5::putString, (reader, context, docId) -> reader.getString(docId, context)); + FileUtils.deleteQuietly(stringSVFile); + } + + @Test(dataProvider = "params") + public void testBytesSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File bytesSVFile = new File(file, "testBytesSV"); + testWriteRead(bytesSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.BYTES, + x -> x.getBytes(StandardCharsets.UTF_8), VarByteChunkForwardIndexWriterV5::putBytes, + (reader, context, docId) -> reader.getBytes(docId, context)); + FileUtils.deleteQuietly(bytesSVFile); + } + + @Test(dataProvider = "params") + public void testStringMV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File stringMVFile = new File(file, "testStringMV"); + testWriteRead(stringMVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, + new StringSplitterMV(), VarByteChunkForwardIndexWriterV5::putStringMV, + (reader, context, docId) -> reader.getStringMV(docId, context)); + FileUtils.deleteQuietly(stringMVFile); + } + + @Test(dataProvider = "params") + public void testBytesMV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) + throws IOException { + File bytesMVFile = new File(file, "testBytesMV"); + testWriteRead(bytesMVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.BYTES, new ByteSplitterMV(), + VarByteChunkForwardIndexWriterV5::putBytesMV, (reader, context, docId) -> reader.getBytesMV(docId, context)); + FileUtils.deleteQuietly(bytesMVFile); + } + + @Test + public void validateCompressionRatioIncrease() + throws IOException { 
+ // Generate input data containing short MV docs with somewhat repetitive data + int numDocs = 1000000; + int numElements = 0; + int maxMVRowSize = 0; + List inputData = new ArrayList<>(numDocs); + for (int i = 0; i < numDocs; i++) { + long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; + maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); + numElements += mvRow.length; + for (int j = 0; j < mvRow.length; j++, numElements++) { + mvRow[j] = numElements % 10; + } + inputData.add(mvRow); + } + + int writerVersion = 5; + // Generate MV fixed byte raw fwd index with explicit length + File explicitLengthFwdIndexFile = _dirs[0]; + FileUtils.deleteQuietly(explicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(explicitLengthFwdIndexFile, + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 4)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // Generate MV fixed byte raw fwd index with implicit length + File implicitLengthFwdIndexFile = _dirs[1]; + FileUtils.deleteQuietly(implicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(implicitLengthFwdIndexFile, + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 5)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be at least + // 2x larger size in explicit length variant in V4 compared to the new implicit length variant in V5 + long expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); + Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); + } + + static class ByteSplitterMV implements Function { + @Override + public byte[][] apply(String input) { + List res = new 
ArrayList<>(); + for (int i = 0; i < input.length(); i += 3) { + int endIndex = Math.min(i + 3, input.length()); + res.add(input.substring(i, endIndex).getBytes()); + } + return res.toArray(new byte[0][]); + } + } + + private void testWriteRead(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize, + FieldSpec.DataType dataType, Function forwardMapper, + BiConsumer write, Read read) + throws IOException { + List values = randomStrings(1000, longestEntry).map(forwardMapper).collect(Collectors.toList()); + try (VarByteChunkForwardIndexWriterV5 writer = new VarByteChunkForwardIndexWriterV5(file, compressionType, + chunkSize)) { + for (T value : values) { + write.accept(writer, value); + } + } + try (PinotDataBuffer buffer = PinotDataBuffer.mapReadOnlyBigEndianFile(file)) { + try (VarByteChunkForwardIndexReaderV5 reader = new VarByteChunkForwardIndexReaderV5(buffer, dataType, true); + VarByteChunkForwardIndexReaderV5.ReaderContext context = reader.createContext()) { + for (int i = 0; i < values.size(); i++) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = 0; i < values.size(); i += 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = 1; i < values.size(); i += 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = 1; i < values.size(); i += 100) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 1; i >= 0; i--) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 1; i >= 0; i -= 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 2; i >= 0; i -= 2) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + for (int i = values.size() - 1; i >= 0; i -= 100) { + assertEquals(read.read(reader, context, i), values.get(i)); + } + } + } + } + + private Stream randomStrings(int count, int 
lengthOfLongestEntry) { + return IntStream.range(0, count).mapToObj(i -> { + int length = ThreadLocalRandom.current().nextInt(lengthOfLongestEntry); + byte[] bytes = new byte[length]; + if (length != 0) { + bytes[bytes.length - 1] = 'c'; + if (length > 2) { + Arrays.fill(bytes, 1, bytes.length - 1, (byte) 'b'); + } + bytes[0] = 'a'; + } + return new String(bytes, StandardCharsets.UTF_8); + }); + } + + @FunctionalInterface + interface Read { + T read(VarByteChunkForwardIndexReaderV5 reader, VarByteChunkForwardIndexReaderV5.ReaderContext context, int docId); + } +} From 38a8cb6947494ad616711e8eb73e472fa4d7627f Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 17:56:48 +0800 Subject: [PATCH 070/101] Add license to VarByteChunkV5Test unit test --- .../index/creator/VarByteChunkV5Test.java | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index bcc71b94dbad..59e336d14384 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; @@ -37,11 +55,16 @@ public class VarByteChunkV5Test extends VarByteChunkV4Test { @DataProvider(parallel = true) public Object[][] params() { Object[][] params = new Object[][]{ - {null, ChunkCompressionType.LZ4, 20, 1024}, {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 20, 1024}, {null - , ChunkCompressionType.PASS_THROUGH, 20, 1024}, {null, ChunkCompressionType.SNAPPY, 20, 1024}, {null, - ChunkCompressionType.ZSTANDARD, 20, 1024}, {null, ChunkCompressionType.LZ4, 2048, 1024}, {null, - ChunkCompressionType.LZ4_LENGTH_PREFIXED, 2048, 1024}, {null, ChunkCompressionType.PASS_THROUGH, 2048, 1024}, - {null, ChunkCompressionType.SNAPPY, 2048, 1024}, {null, ChunkCompressionType.ZSTANDARD, 2048, 1024} + {null, ChunkCompressionType.LZ4, 20, 1024}, + {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 20, 1024}, + {null, ChunkCompressionType.PASS_THROUGH, 20, 1024}, + {null, ChunkCompressionType.SNAPPY, 20, 1024}, + {null, ChunkCompressionType.ZSTANDARD, 20, 1024}, + {null, ChunkCompressionType.LZ4, 2048, 1024}, + {null, ChunkCompressionType.LZ4_LENGTH_PREFIXED, 2048, 1024}, + {null, ChunkCompressionType.PASS_THROUGH, 2048, 1024}, + {null, ChunkCompressionType.SNAPPY, 2048, 1024}, + {null, ChunkCompressionType.ZSTANDARD, 2048, 1024} }; for (int i = 0; i < _dirs.length; i++) { @@ -124,12 +147,17 @@ public void validateCompressionRatioIncrease() inputData.add(mvRow); } - int writerVersion = 5; + for (int i = 0; i < _dirs.length; i++) { + _dirs[i] = new File(new 
File(FileUtils.getTempDirectory(), UUID.randomUUID().toString()), "VarByteChunkV5Test"); + FileUtils.forceMkdir(_dirs[i]); + } + // Generate MV fixed byte raw fwd index with explicit length - File explicitLengthFwdIndexFile = _dirs[0]; + int rawIndexVersionV4 = 4; + File explicitLengthFwdIndexFile = new File(FileUtils.getTempDirectory(), Integer.toString(rawIndexVersionV4)); FileUtils.deleteQuietly(explicitLengthFwdIndexFile); try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(explicitLengthFwdIndexFile, - ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 4)) { + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, rawIndexVersionV4)) { for (long[] mvRow : inputData) { creator.putLongMV(mvRow); } From b8dfacd04577af98f33aebdece7932926e9bb377 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 22:02:01 +0800 Subject: [PATCH 071/101] Improved unit test --- .../index/creator/VarByteChunkV4Test.java | 21 +++++++++- .../index/creator/VarByteChunkV5Test.java | 38 +++---------------- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java index 70313d91e701..3b83a793fcb0 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java @@ -33,6 +33,7 @@ import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import 
org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; @@ -88,9 +89,25 @@ public void deleteDirs() { } } + public VarByteChunkForwardIndexWriterV5 writerSupplierV5(File file, ChunkCompressionType compressionType, + int chunkSize) + throws IOException { + return new VarByteChunkForwardIndexWriterV5(file, compressionType, chunkSize); + } + + public VarByteChunkForwardIndexWriterV4 writerSupplierV4(File file, ChunkCompressionType compressionType, + int chunkSize) + throws IOException { + return new VarByteChunkForwardIndexWriterV4(file, compressionType, chunkSize); + } + + public VarByteChunkForwardIndexReaderV4 readerSupplierV4(PinotDataBuffer buffer, FieldSpec.DataType dataType, boolean isSingleValue) { + return new VarByteChunkForwardIndexReaderV4(buffer, dataType, isSingleValue); + } + @Test(dataProvider = "params") public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) - throws IOException { + throws IOException, RuntimeException { File stringSVFile = new File(file, "testStringSV"); testWriteRead(stringSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, x -> x, VarByteChunkForwardIndexWriterV4::putString, (reader, context, docId) -> reader.getString(docId, context)); @@ -193,7 +210,7 @@ private void testWriteRead(File file, ChunkCompressionType compressionType, } } - private Stream randomStrings(int count, int lengthOfLongestEntry) { + protected Stream randomStrings(int count, int lengthOfLongestEntry) { return IntStream.range(0, count) .mapToObj(i -> { int length = ThreadLocalRandom.current().nextInt(lengthOfLongestEntry); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index 59e336d14384..c5bbc75c2760 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -22,16 +22,12 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.Random; import java.util.UUID; -import java.util.concurrent.ThreadLocalRandom; import java.util.function.BiConsumer; import java.util.function.Function; import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; @@ -164,10 +160,11 @@ public void validateCompressionRatioIncrease() } // Generate MV fixed byte raw fwd index with implicit length - File implicitLengthFwdIndexFile = _dirs[1]; + int rawIndexVersionV5 = 5; + File implicitLengthFwdIndexFile = new File(FileUtils.getTempDirectory(), Integer.toString(rawIndexVersionV5)); FileUtils.deleteQuietly(implicitLengthFwdIndexFile); try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator(implicitLengthFwdIndexFile, - ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, 5)) { + ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, true, rawIndexVersionV5)) { for (long[] mvRow : inputData) { creator.putLongMV(mvRow); } @@ -177,18 +174,10 @@ public void validateCompressionRatioIncrease() // 2x larger size in explicit length variant in V4 compared to the new implicit length variant in V5 long 
expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); - } - static class ByteSplitterMV implements Function { - @Override - public byte[][] apply(String input) { - List res = new ArrayList<>(); - for (int i = 0; i < input.length(); i += 3) { - int endIndex = Math.min(i + 3, input.length()); - res.add(input.substring(i, endIndex).getBytes()); - } - return res.toArray(new byte[0][]); - } + // Cleanup + FileUtils.deleteQuietly(explicitLengthFwdIndexFile); + FileUtils.deleteQuietly(implicitLengthFwdIndexFile); } private void testWriteRead(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize, @@ -233,21 +222,6 @@ private void testWriteRead(File file, ChunkCompressionType compressionType, } } - private Stream randomStrings(int count, int lengthOfLongestEntry) { - return IntStream.range(0, count).mapToObj(i -> { - int length = ThreadLocalRandom.current().nextInt(lengthOfLongestEntry); - byte[] bytes = new byte[length]; - if (length != 0) { - bytes[bytes.length - 1] = 'c'; - if (length > 2) { - Arrays.fill(bytes, 1, bytes.length - 1, (byte) 'b'); - } - bytes[0] = 'a'; - } - return new String(bytes, StandardCharsets.UTF_8); - }); - } - @FunctionalInterface interface Read { T read(VarByteChunkForwardIndexReaderV5 reader, VarByteChunkForwardIndexReaderV5.ReaderContext context, int docId); From bd9bdee57f59b1f4e3af91a94aa45a89d3f996ba Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 22:29:40 +0800 Subject: [PATCH 072/101] Refactored unit test --- .../index/creator/VarByteChunkV4Test.java | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java index 
3b83a793fcb0..6c8eb6d9fe89 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java @@ -33,7 +33,6 @@ import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; @@ -89,22 +88,6 @@ public void deleteDirs() { } } - public VarByteChunkForwardIndexWriterV5 writerSupplierV5(File file, ChunkCompressionType compressionType, - int chunkSize) - throws IOException { - return new VarByteChunkForwardIndexWriterV5(file, compressionType, chunkSize); - } - - public VarByteChunkForwardIndexWriterV4 writerSupplierV4(File file, ChunkCompressionType compressionType, - int chunkSize) - throws IOException { - return new VarByteChunkForwardIndexWriterV4(file, compressionType, chunkSize); - } - - public VarByteChunkForwardIndexReaderV4 readerSupplierV4(PinotDataBuffer buffer, FieldSpec.DataType dataType, boolean isSingleValue) { - return new VarByteChunkForwardIndexReaderV4(buffer, dataType, isSingleValue); - } - @Test(dataProvider = "params") public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) throws IOException, RuntimeException { From 5b6e29ef4b3cb8a5db56bee2d9d1b30d9bbb3088 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 23:35:40 +0800 Subject: [PATCH 073/101] Add blank line --- .../segment/local/segment/index/creator/VarByteChunkV5Test.java | 1 + 1 file changed, 1 insertion(+) diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index c5bbc75c2760..f80f764fe462 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. */ + package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; From 597762fdc84c0a72b8a284f5a973146f564c0bd1 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 23:35:59 +0800 Subject: [PATCH 074/101] Remove blank line --- .../segment/local/segment/index/creator/VarByteChunkV5Test.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index f80f764fe462..c5bbc75c2760 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; From ff21345782c1cda5835d81ebd9bf60404f02e2e1 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 10 Oct 2024 22:42:48 +0800 Subject: [PATCH 075/101] Add blank line --- .../pinot/integration/tests/UpsertTableIntegrationTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java index 30b15c0c1a6e..f7404dc64bd8 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java @@ -71,6 +71,7 @@ * - Dimension fields: playerId:int (primary key), name:string, game:string, deleted:boolean * - Metric fields: score:float * - DataTime fields: timestampInEpoch:long + * */ public class UpsertTableIntegrationTest extends BaseClusterIntegrationTestSet { private static final String INPUT_DATA_SMALL_TAR_FILE = "gameScores_csv.tar.gz"; From cde2a6d89bf487fb48ff86eb825f66dfd5c23ab7 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 10 Oct 2024 22:43:01 +0800 Subject: [PATCH 076/101] Remove blank line --- .../pinot/integration/tests/UpsertTableIntegrationTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java index f7404dc64bd8..30b15c0c1a6e 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java @@ -71,7 +71,6 @@ * - Dimension fields: playerId:int 
(primary key), name:string, game:string, deleted:boolean * - Metric fields: score:float * - DataTime fields: timestampInEpoch:long - * */ public class UpsertTableIntegrationTest extends BaseClusterIntegrationTestSet { private static final String INPUT_DATA_SMALL_TAR_FILE = "gameScores_csv.tar.gz"; From d430d61929ead71aecfcd3c955a7172d7594cf93 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Tue, 15 Oct 2024 05:18:27 +0800 Subject: [PATCH 077/101] Refactored code to utilize changes from Extract common MV ser/de logic into ArraySerDeUtils #14209 --- .../VarByteChunkForwardIndexWriterV5.java | 21 ++++++++++ .../MultiValueFixedByteRawIndexCreator.java | 22 +++++++--- .../forward/ForwardIndexReaderFactory.java | 5 +-- .../FixedByteChunkMVForwardIndexReader.java | 5 --- .../VarByteChunkForwardIndexReaderV5.java | 41 +++++++++++++++++-- 5 files changed, 77 insertions(+), 17 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java index 72c94e210139..968891089b8f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java @@ -21,6 +21,7 @@ import java.io.File; import java.io.IOException; import javax.annotation.concurrent.NotThreadSafe; +import org.apache.pinot.segment.local.utils.ArraySerDeUtils; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; @@ -52,4 +53,24 @@ public VarByteChunkForwardIndexWriterV5(File file, ChunkCompressionType compress public int getVersion() { return VERSION; } + + @Override + public void putIntMV(int[] values) { + putBytes(ArraySerDeUtils.serializeIntArrayWithoutLength(values)); + } + + @Override + public void putLongMV(long[] values) 
{ + putBytes(ArraySerDeUtils.serializeLongArrayWithoutLength(values)); + } + + @Override + public void putFloatMV(float[] values) { + putBytes(ArraySerDeUtils.serializeFloatArrayWithoutLength(values)); + } + + @Override + public void putDoubleMV(double[] values) { + putBytes(ArraySerDeUtils.serializeDoubleArrayWithoutLength(values)); + } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index 4fafcbcaad5e..b8a6bd6daafd 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriter; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkWriter; import org.apache.pinot.segment.spi.V1Constants.Indexes; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; @@ -69,23 +70,32 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType c ForwardIndexConfig.DEFAULT_TARGET_DOCS_PER_CHUNK); } - public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) throws IOException { - // Store the length followed by the values - int totalMaxLength = Integer.BYTES + (maxNumberOfMultiValueElements * 
valueType.getStoredType().size()); if (writerVersion < VarByteChunkForwardIndexWriterV4.VERSION) { + // Store the length followed by the values + int totalMaxLength = Integer.BYTES + (maxNumberOfMultiValueElements * valueType.getStoredType().size()); int numDocsPerChunk = deriveNumDocsPerChunk ? Math.max(targetMaxChunkSizeBytes / (totalMaxLength + VarByteChunkForwardIndexWriter.CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE), 1) : targetDocsPerChunk; _indexWriter = new VarByteChunkForwardIndexWriter(indexFile, compressionType, totalDocs, numDocsPerChunk, totalMaxLength, writerVersion); } else { - int chunkSize = - ForwardIndexUtils.getDynamicTargetChunkSize(totalMaxLength, targetDocsPerChunk, targetMaxChunkSizeBytes); - _indexWriter = new VarByteChunkForwardIndexWriterV4(indexFile, compressionType, chunkSize); + if (writerVersion == VarByteChunkForwardIndexWriterV5.VERSION) { + // Store only the values + int totalMaxLength = maxNumberOfMultiValueElements * valueType.getStoredType().size(); + int chunkSize = + ForwardIndexUtils.getDynamicTargetChunkSize(totalMaxLength, targetDocsPerChunk, targetMaxChunkSizeBytes); + _indexWriter = new VarByteChunkForwardIndexWriterV5(indexFile, compressionType, chunkSize); + } else { + // Store the length followed by the values + int totalMaxLength = Integer.BYTES + (maxNumberOfMultiValueElements * valueType.getStoredType().size()); + int chunkSize = + ForwardIndexUtils.getDynamicTargetChunkSize(totalMaxLength, targetDocsPerChunk, targetMaxChunkSizeBytes); + _indexWriter = new VarByteChunkForwardIndexWriterV4(indexFile, compressionType, chunkSize); + } } _valueType = valueType; } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java index fdf1cfd1b96a..cc7201ed985f 100644 --- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexReaderFactory.java @@ -108,11 +108,10 @@ public static ForwardIndexReader createRawIndexReader(PinotDataBuffer dataBuffer : new FixedByteChunkSVForwardIndexReader(dataBuffer, storedType); } - if (version >= VarByteChunkForwardIndexWriterV5.VERSION) { + if (version == VarByteChunkForwardIndexWriterV5.VERSION) { // V5 is the same as V4 except the multi-value docs have implicit value count rather than explicit return new VarByteChunkForwardIndexReaderV5(dataBuffer, storedType, isSingleValue); - } - if (version == VarByteChunkForwardIndexWriterV4.VERSION) { + } else if (version == VarByteChunkForwardIndexWriterV4.VERSION) { // V4 reader is common for sv var byte, mv fixed byte and mv var byte return new VarByteChunkForwardIndexReaderV4(dataBuffer, storedType, isSingleValue); } else { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java index 96299ffb32c0..8e53ecb15639 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReader.java @@ -42,11 +42,6 @@ public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType s _maxChunkSize = _numDocsPerChunk * (ROW_OFFSET_SIZE + _lengthOfLongestEntry); } - public FixedByteChunkMVForwardIndexReader(PinotDataBuffer dataBuffer, DataType storedType, boolean explicitMVEntrySize) { - this(dataBuffer, storedType); - _explicitMVEntrySize = 
explicitMVEntrySize; - } - @Nullable @Override public ChunkReaderContext createContext() { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java index fd4528c27df7..3221479c9f91 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -19,9 +19,9 @@ package org.apache.pinot.segment.local.segment.index.readers.forward; import com.google.common.base.Preconditions; -import java.nio.ByteBuffer; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; +import org.apache.pinot.segment.local.utils.ArraySerDeUtils; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; @@ -44,7 +44,42 @@ public void validateIndexVersion(PinotDataBuffer dataBuffer) { } @Override - protected int getNumFixedByteValuesMV(ByteBuffer byteBuffer) { - return byteBuffer.remaining() / _storedType.size(); + public int getIntMV(int docId, int[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeIntArrayWithoutLength(context.getValue(docId), valueBuffer); + } + + @Override + public int[] getIntMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeIntArrayWithoutLength(context.getValue(docId)); + } + + @Override + public int getLongMV(int docId, long[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return 
ArraySerDeUtils.deserializeLongArrayWithoutLength(context.getValue(docId), valueBuffer); + } + + @Override + public long[] getLongMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeLongArrayWithoutLength(context.getValue(docId)); + } + + @Override + public int getFloatMV(int docId, float[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeFloatArrayWithoutLength(context.getValue(docId), valueBuffer); + } + + @Override + public float[] getFloatMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeFloatArrayWithoutLength(context.getValue(docId)); + } + + @Override + public int getDoubleMV(int docId, double[] valueBuffer, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeDoubleArrayWithoutLength(context.getValue(docId), valueBuffer); + } + + @Override + public double[] getDoubleMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderContext context) { + return ArraySerDeUtils.deserializeDoubleArrayWithoutLength(context.getValue(docId)); } } From 3f68f750e7a0e6b50da0cf691c00f60c7269c1fd Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 05:24:41 +0800 Subject: [PATCH 078/101] Rebased to latest --- .../segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java | 2 +- .../creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java index c681265ffb9f..e15bb49d0ee5 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java @@ -120,7 +120,7 @@ public CLPForwardIndexCreatorV1(File baseIndexDir, String column, int numDocs, C _encodedVarsFwdIndexFile = new File(_intermediateFilesDir, column + "_clp_encodedvars.fwd"); _encodedVarsFwdIndexWriter = new MultiValueFixedByteRawIndexCreator(_encodedVarsFwdIndexFile, ChunkCompressionType.LZ4, numDocs, - FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, + FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, true, VarByteChunkForwardIndexWriterV4.VERSION); _clpStats.clear(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index b8a6bd6daafd..143f4a0ae7cd 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -70,6 +70,7 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType c ForwardIndexConfig.DEFAULT_TARGET_DOCS_PER_CHUNK); } + public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) From 27328bf58fabd31592d137ca60515f3214ec51aa Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 05:31:50 +0800 Subject: [PATCH 079/101] Rebase to latest --- .../creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java | 1 - 1 file changed, 1 deletion(-) diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index 143f4a0ae7cd..b8a6bd6daafd 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -70,7 +70,6 @@ public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType c ForwardIndexConfig.DEFAULT_TARGET_DOCS_PER_CHUNK); } - public MultiValueFixedByteRawIndexCreator(File indexFile, ChunkCompressionType compressionType, int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) From 79c6f66c20a5efc16f865cba3d0c0c6e5b70af86 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:21:34 +0800 Subject: [PATCH 080/101] Refactored code to use new class versions. 
--- .../impl/fwd/CLPForwardIndexCreatorV1.java | 2 +- .../MultiValueFixedByteRawIndexCreatorV2.java | 116 ++++++++++++++++++ .../FixedByteChunkMVForwardIndexReaderV2.java | 45 +++++++ ...tiValueFixedByteRawIndexCreatorV2Test.java | 52 ++++++++ 4 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java create mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java index e15bb49d0ee5..c681265ffb9f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV1.java @@ -120,7 +120,7 @@ public CLPForwardIndexCreatorV1(File baseIndexDir, String column, int numDocs, C _encodedVarsFwdIndexFile = new File(_intermediateFilesDir, column + "_clp_encodedvars.fwd"); _encodedVarsFwdIndexWriter = new MultiValueFixedByteRawIndexCreator(_encodedVarsFwdIndexFile, ChunkCompressionType.LZ4, numDocs, - FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, true, + FieldSpec.DataType.LONG, _clpStats.getMaxNumberOfEncodedVars(), false, VarByteChunkForwardIndexWriterV4.VERSION); _clpStats.clear(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java new file mode 100644 index 000000000000..90bc9281d23a --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.segment.creator.impl.fwd; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + Same as MultiValueFixedByteRawIndexCreator, but without storing the number of elements for each row. 
+ */ +public class MultiValueFixedByteRawIndexCreatorV2 extends MultiValueFixedByteRawIndexCreator { + /** + * Create a var-byte raw index creator for the given column + * + * @param baseIndexDir Index directory + * @param compressionType Type of compression to use + * @param column Name of column to index + * @param totalDocs Total number of documents to index + * @param valueType Type of the values + * @param deriveNumDocsPerChunk true if writer should auto-derive the number of rows per chunk + * @param writerVersion writer format version + * @param targetMaxChunkSizeBytes target max chunk size in bytes, applicable only for V4 or when + * deriveNumDocsPerChunk is true + */ + public MultiValueFixedByteRawIndexCreatorV2(File baseIndexDir, ChunkCompressionType compressionType, String column, + int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, + int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) + throws IOException { + super(baseIndexDir, compressionType, column, totalDocs, valueType, maxNumberOfMultiValueElements, + deriveNumDocsPerChunk, writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); + } + + public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion) + throws IOException { + super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, + writerVersion); + } + + public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, + DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, + int targetMaxChunkSizeBytes, int targetDocsPerChunk) + throws IOException { + super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, + 
writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); + } + + @Override + protected int computeTotalMaxLength(int maxNumberOfMultiValueElements, DataType valueType) { + return maxNumberOfMultiValueElements * valueType.getStoredType().size(); + } + + @Override + public void putIntMV(int[] values) { + byte[] bytes = new byte[values.length * Integer.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (int value : values) { + byteBuffer.putInt(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putLongMV(long[] values) { + byte[] bytes = new byte[values.length * Long.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (long value : values) { + byteBuffer.putLong(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putFloatMV(float[] values) { + byte[] bytes = new byte[values.length * Float.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (float value : values) { + byteBuffer.putFloat(value); + } + _indexWriter.putBytes(bytes); + } + + @Override + public void putDoubleMV(double[] values) { + byte[] bytes = new byte[values.length * Double.BYTES]; + ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); + //write the content of each element + for (double value : values) { + byteBuffer.putDouble(value); + } + _indexWriter.putBytes(bytes); + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java new file mode 100644 index 000000000000..762672928d72 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java @@ -0,0 +1,45 @@ +/** + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.segment.index.readers.forward; + +import java.nio.ByteBuffer; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + Same as FixedByteChunkMVForwardIndexReader, but the number of elements for each row is inferred + */ +public final class FixedByteChunkMVForwardIndexReaderV2 extends FixedByteChunkMVForwardIndexReader { + + public FixedByteChunkMVForwardIndexReaderV2(PinotDataBuffer dataBuffer, DataType storedType) { + super(dataBuffer, storedType); + } + + @Override + public int getNumValuesMV(int docId, ChunkReaderContext context) { + ByteBuffer byteBuffer = slice(docId, context); + return getNumValuesMV(byteBuffer); + } + + @Override + protected int getNumValuesMV(ByteBuffer byteBuffer) { + return byteBuffer.remaining() / _storedType.size(); + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java new file mode 100644 index 
000000000000..e4937a12ab3f --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.index.creator; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.function.IntFunction; +import java.util.function.ToIntFunction; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReader; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; +import org.apache.pinot.segment.spi.V1Constants.Indexes; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + + +public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { + // TODO: +} From e9778d33623c5f82f5f7b308387702146df32181 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:25:29 +0800 Subject: [PATCH 081/101] Fixed style. 
--- ...tiValueFixedByteRawIndexCreatorV2Test.java | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index e4937a12ab3f..8a3e2e4fc65e 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,35 +18,6 @@ */ package org.apache.pinot.segment.local.segment.index.creator; -import java.io.File; -import java.io.IOException; -import java.nio.ByteOrder; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; -import java.util.function.IntFunction; -import java.util.function.ToIntFunction; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.commons.io.FileUtils; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; -import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReader; -import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; -import org.apache.pinot.segment.spi.V1Constants.Indexes; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec.DataType; -import org.testng.Assert; -import 
org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - - public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { // TODO: } From b43f6769205cb33e292729a9147dadd8e2a1b248 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:32:21 +0800 Subject: [PATCH 082/101] Refactored MultiValueFixedByteRawIndexCreatorTest.java --- ...MultiValueFixedByteRawIndexCreatorTest.java | 6 ++++++ ...ltiValueFixedByteRawIndexCreatorV2Test.java | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index 9a2105726aa7..a22260a5950e 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -196,6 +196,12 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo } } + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV4(buffer, + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); + } + interface Extractor { T extract(ForwardIndexReader reader, ForwardIndexReaderContext context, int offset, T buffer); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index 8a3e2e4fc65e..1c90b8092f6a 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,6 +18,22 @@ */ package org.apache.pinot.segment.local.segment.index.creator; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; +import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec; + + +/** + Same as MultiValueFixedByteRawIndexCreatorTest, but the forward index creator and reader are newer version + */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { - // TODO: + @Override + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { + return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV5(buffer, + dataType.getStoredType(), false) + : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); + } } From c6033b96fd3c0d67522c50f4a3fb80cbcc4f55a3 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 08:36:46 +0800 Subject: [PATCH 083/101] Fix style. --- .../creator/MultiValueFixedByteRawIndexCreatorV2Test.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index 1c90b8092f6a..c07257901009 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -31,9 +31,9 @@ */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { @Override - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { + protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, + int writerVersion) { return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV5(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); + dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); } } From 3f654e497c9172dd0cc77fff1a0c8826360a33f3 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 14:25:36 +0800 Subject: [PATCH 084/101] Modified existing unit test and extended it for MultiValueFixedByteRawIndexCreatorV2Test.java --- ...ultiValueFixedByteRawIndexCreatorTest.java | 6 ----- ...tiValueFixedByteRawIndexCreatorV2Test.java | 27 +++++++++++++++++-- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java index a22260a5950e..9a2105726aa7 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorTest.java @@ -196,12 +196,6 @@ public void testMV(DataType dataType, List inputs, ToIntFunction sizeo } } - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, DataType dataType, int writerVersion) { - return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV4(buffer, - dataType.getStoredType(), false) - : new FixedByteChunkMVForwardIndexReader(buffer, dataType.getStoredType()); - } - interface Extractor { T extract(ForwardIndexReader reader, ForwardIndexReaderContext context, int offset, T buffer); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index c07257901009..eae997014e22 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -18,20 +18,43 @@ */ package org.apache.pinot.segment.local.segment.index.creator; +import java.io.File; +import java.io.IOException; +import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; +import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreatorV2; import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; +import org.testng.annotations.BeforeClass; /** - Same as MultiValueFixedByteRawIndexCreatorTest, but the forward index creator and reader are newer version + Same as 
MultiValueFixedByteRawIndexCreatorTest, but newer version of forward index creator and reader are used */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { + @BeforeClass + public void setup() + throws Exception { + _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawV2Test"; + FileUtils.forceMkdir(new File(_outputDir)); + } + + @Override + public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator( + ChunkCompressionType compressionType, String column, int numDocs, FieldSpec.DataType dataType, int maxElements, + int writerVersion) + throws IOException { + return new MultiValueFixedByteRawIndexCreatorV2(new File(_outputDir), compressionType, column, numDocs, dataType, + maxElements, false, writerVersion, 1024 * 1024, 1000); + } + @Override - protected ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, + public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, int writerVersion) { return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? 
new VarByteChunkForwardIndexReaderV5(buffer, dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); From 12af1ceb59aea8201b5a118efe6559e8f6b62ef9 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 18:15:33 +0800 Subject: [PATCH 085/101] Improved unit test for MultiValueFixedByteRawIndexCreatorTest and MultiValueFixedByteRawIndexCreatorV2Test --- ...tiValueFixedByteRawIndexCreatorV2Test.java | 63 ++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index eae997014e22..dd3d0b1156b8 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -20,6 +20,8 @@ import java.io.File; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; @@ -30,11 +32,15 @@ import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; import org.apache.pinot.spi.data.FieldSpec; +import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; /** - Same as MultiValueFixedByteRawIndexCreatorTest, but newer version of forward index creator and reader are used + Similar to MultiValueFixedByteRawIndexCreatorTest, but utilizes the newer version of the forward 
index creator and + reader. Additionally, this test class includes a validation test for checking the compression ratio improvement with + the forward index creator version upgrade. */ public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { @BeforeClass @@ -59,4 +65,59 @@ public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpe return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV5(buffer, dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); } + + @Test + public void validateCompressionRatioIncrease() + throws IOException { + // Generate input data containing short MV docs with somewhat repetitive data + int numDocs = 1000000; + int numElements = 0; + int maxMVRowSize = 0; + List inputData = new ArrayList<>(numDocs); + for (int i = 0; i < numDocs; i++) { + long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; + maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); + numElements += mvRow.length; + for (int j = 0; j < mvRow.length; j++, numElements++) { + mvRow[j] = numElements % 10; + } + inputData.add(mvRow); + } + + for (int writerVersion : List.of(2, 4)) { + // Generate MV fixed byte raw fwd index with explicit length + File explicitLengthFwdIndexFile = new File(_outputDir, MultiValueFixedByteRawIndexCreator.class.getSimpleName()); + FileUtils.deleteQuietly(explicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator( + explicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, + true, writerVersion)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // Generate MV fixed byte raw fwd index with implicit length + File implicitLengthFwdIndexFile = + new File(_outputDir, MultiValueFixedByteRawIndexCreatorV2.class.getSimpleName()); + 
FileUtils.deleteQuietly(implicitLengthFwdIndexFile); + try (MultiValueFixedByteRawIndexCreatorV2 creator = new MultiValueFixedByteRawIndexCreatorV2( + implicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, + true, writerVersion)) { + for (long[] mvRow : inputData) { + creator.putLongMV(mvRow); + } + } + + // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: + // 1. At least 15% larger than the implicit length variant when using Writer Version 2 + // 2. At least 200% larger than the implicit length variant when using Writer Version 4 + long expectedImplicitLengthFwdIndexMaxSize; + if (writerVersion == 2) { + expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); + } else { + expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); + } + Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); + } + } } From 063c5b48e3ce3be65962e8018820372d4a832fbd Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 2 Oct 2024 23:45:49 +0800 Subject: [PATCH 086/101] Adjusted comments content --- .../creator/MultiValueFixedByteRawIndexCreatorV2Test.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java index dd3d0b1156b8..f7105630c5e3 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java @@ -109,8 +109,8 @@ public void validateCompressionRatioIncrease() } 
// For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: - // 1. At least 15% larger than the implicit length variant when using Writer Version 2 - // 2. At least 200% larger than the implicit length variant when using Writer Version 4 + // 1. At least 1.15x larger than the implicit length variant when using Writer Version 2 + // 2. At least 2x larger than the implicit length variant when using Writer Version 4 long expectedImplicitLengthFwdIndexMaxSize; if (writerVersion == 2) { expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); From b8794f285f1a0aa40929edcdb207bf3fac1791f5 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 05:51:53 +0800 Subject: [PATCH 087/101] Upgrade MVFixedByteRawIndex reader and writer from V4 to V5, retain forward index creator version. --- .../MultiValueFixedByteRawIndexCreatorV2.java | 116 ------------------ 1 file changed, 116 deletions(-) delete mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java deleted file mode 100644 index 90bc9281d23a..000000000000 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreatorV2.java +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.pinot.segment.local.segment.creator.impl.fwd; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.spi.data.FieldSpec.DataType; - - -/** - Same as MultiValueFixedByteRawIndexCreator, but without storing the number of elements for each row. - */ -public class MultiValueFixedByteRawIndexCreatorV2 extends MultiValueFixedByteRawIndexCreator { - /** - * Create a var-byte raw index creator for the given column - * - * @param baseIndexDir Index directory - * @param compressionType Type of compression to use - * @param column Name of column to index - * @param totalDocs Total number of documents to index - * @param valueType Type of the values - * @param deriveNumDocsPerChunk true if writer should auto-derive the number of rows per chunk - * @param writerVersion writer format version - * @param targetMaxChunkSizeBytes target max chunk size in bytes, applicable only for V4 or when - * deriveNumDocsPerChunk is true - */ - public MultiValueFixedByteRawIndexCreatorV2(File baseIndexDir, ChunkCompressionType compressionType, String column, - int totalDocs, DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, - int writerVersion, int targetMaxChunkSizeBytes, int targetDocsPerChunk) - throws IOException { - 
super(baseIndexDir, compressionType, column, totalDocs, valueType, maxNumberOfMultiValueElements, - deriveNumDocsPerChunk, writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); - } - - public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion) - throws IOException { - super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, - writerVersion); - } - - public MultiValueFixedByteRawIndexCreatorV2(File indexFile, ChunkCompressionType compressionType, int totalDocs, - DataType valueType, int maxNumberOfMultiValueElements, boolean deriveNumDocsPerChunk, int writerVersion, - int targetMaxChunkSizeBytes, int targetDocsPerChunk) - throws IOException { - super(indexFile, compressionType, totalDocs, valueType, maxNumberOfMultiValueElements, deriveNumDocsPerChunk, - writerVersion, targetMaxChunkSizeBytes, targetDocsPerChunk); - } - - @Override - protected int computeTotalMaxLength(int maxNumberOfMultiValueElements, DataType valueType) { - return maxNumberOfMultiValueElements * valueType.getStoredType().size(); - } - - @Override - public void putIntMV(int[] values) { - byte[] bytes = new byte[values.length * Integer.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (int value : values) { - byteBuffer.putInt(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putLongMV(long[] values) { - byte[] bytes = new byte[values.length * Long.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (long value : values) { - byteBuffer.putLong(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putFloatMV(float[] values) { - byte[] bytes = new byte[values.length * Float.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - 
//write the content of each element - for (float value : values) { - byteBuffer.putFloat(value); - } - _indexWriter.putBytes(bytes); - } - - @Override - public void putDoubleMV(double[] values) { - byte[] bytes = new byte[values.length * Double.BYTES]; - ByteBuffer byteBuffer = ByteBuffer.wrap(bytes); - //write the content of each element - for (double value : values) { - byteBuffer.putDouble(value); - } - _indexWriter.putBytes(bytes); - } -} From 9812e3ee87ee63ac2f669e3fb4adaf6faa45a8bd Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 16:35:23 +0800 Subject: [PATCH 088/101] Deleted FixByteChunkMVForwardIndexReaderV2 --- .../FixedByteChunkMVForwardIndexReaderV2.java | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java deleted file mode 100644 index 762672928d72..000000000000 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/FixedByteChunkMVForwardIndexReaderV2.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.pinot.segment.local.segment.index.readers.forward; - -import java.nio.ByteBuffer; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec.DataType; - - -/** - Same as FixedByteChunkMVForwardIndexReader, but the number of elements for each row is inferred - */ -public final class FixedByteChunkMVForwardIndexReaderV2 extends FixedByteChunkMVForwardIndexReader { - - public FixedByteChunkMVForwardIndexReaderV2(PinotDataBuffer dataBuffer, DataType storedType) { - super(dataBuffer, storedType); - } - - @Override - public int getNumValuesMV(int docId, ChunkReaderContext context) { - ByteBuffer byteBuffer = slice(docId, context); - return getNumValuesMV(byteBuffer); - } - - @Override - protected int getNumValuesMV(ByteBuffer byteBuffer) { - return byteBuffer.remaining() / _storedType.size(); - } -} From 085aed6fbe34d67815bfa7124afc4b5c12f938f7 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 16:44:48 +0800 Subject: [PATCH 089/101] Deleted FixByteChunkMVForwardIndexReaderV2Test --- ...tiValueFixedByteRawIndexCreatorV2Test.java | 123 ------------------ 1 file changed, 123 deletions(-) delete mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java deleted file mode 100644 index f7105630c5e3..000000000000 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/MultiValueFixedByteRawIndexCreatorV2Test.java +++ /dev/null @@ -1,123 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.pinot.segment.local.segment.index.creator; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import org.apache.commons.io.FileUtils; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreator; -import org.apache.pinot.segment.local.segment.creator.impl.fwd.MultiValueFixedByteRawIndexCreatorV2; -import org.apache.pinot.segment.local.segment.index.readers.forward.FixedByteChunkMVForwardIndexReaderV2; -import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV5; -import org.apache.pinot.segment.spi.compression.ChunkCompressionType; -import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; -import org.apache.pinot.segment.spi.memory.PinotDataBuffer; -import org.apache.pinot.spi.data.FieldSpec; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - - -/** - Similar to MultiValueFixedByteRawIndexCreatorTest, but utilizes the newer version of the forward index creator and - reader. Additionally, this test class includes a validation test for checking the compression ratio improvement with - the forward index creator version upgrade. 
- */ -public class MultiValueFixedByteRawIndexCreatorV2Test extends MultiValueFixedByteRawIndexCreatorTest { - @BeforeClass - public void setup() - throws Exception { - _outputDir = System.getProperty("java.io.tmpdir") + File.separator + "mvFixedRawV2Test"; - FileUtils.forceMkdir(new File(_outputDir)); - } - - @Override - public MultiValueFixedByteRawIndexCreator getMultiValueFixedByteRawIndexCreator( - ChunkCompressionType compressionType, String column, int numDocs, FieldSpec.DataType dataType, int maxElements, - int writerVersion) - throws IOException { - return new MultiValueFixedByteRawIndexCreatorV2(new File(_outputDir), compressionType, column, numDocs, dataType, - maxElements, false, writerVersion, 1024 * 1024, 1000); - } - - @Override - public ForwardIndexReader getForwardIndexReader(PinotDataBuffer buffer, FieldSpec.DataType dataType, - int writerVersion) { - return writerVersion == VarByteChunkForwardIndexWriterV4.VERSION ? new VarByteChunkForwardIndexReaderV5(buffer, - dataType.getStoredType(), false) : new FixedByteChunkMVForwardIndexReaderV2(buffer, dataType.getStoredType()); - } - - @Test - public void validateCompressionRatioIncrease() - throws IOException { - // Generate input data containing short MV docs with somewhat repetitive data - int numDocs = 1000000; - int numElements = 0; - int maxMVRowSize = 0; - List inputData = new ArrayList<>(numDocs); - for (int i = 0; i < numDocs; i++) { - long[] mvRow = new long[Math.abs((int) Math.floor(RANDOM.nextGaussian()))]; - maxMVRowSize = Math.max(maxMVRowSize, mvRow.length); - numElements += mvRow.length; - for (int j = 0; j < mvRow.length; j++, numElements++) { - mvRow[j] = numElements % 10; - } - inputData.add(mvRow); - } - - for (int writerVersion : List.of(2, 4)) { - // Generate MV fixed byte raw fwd index with explicit length - File explicitLengthFwdIndexFile = new File(_outputDir, MultiValueFixedByteRawIndexCreator.class.getSimpleName()); - FileUtils.deleteQuietly(explicitLengthFwdIndexFile); - try 
(MultiValueFixedByteRawIndexCreator creator = new MultiValueFixedByteRawIndexCreator( - explicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, - true, writerVersion)) { - for (long[] mvRow : inputData) { - creator.putLongMV(mvRow); - } - } - - // Generate MV fixed byte raw fwd index with implicit length - File implicitLengthFwdIndexFile = - new File(_outputDir, MultiValueFixedByteRawIndexCreatorV2.class.getSimpleName()); - FileUtils.deleteQuietly(implicitLengthFwdIndexFile); - try (MultiValueFixedByteRawIndexCreatorV2 creator = new MultiValueFixedByteRawIndexCreatorV2( - implicitLengthFwdIndexFile, ChunkCompressionType.ZSTANDARD, numDocs, FieldSpec.DataType.LONG, numElements, - true, writerVersion)) { - for (long[] mvRow : inputData) { - creator.putLongMV(mvRow); - } - } - - // For the input data, the explicit length compressed MV fixed byte raw forward index is expected to be: - // 1. At least 1.15x larger than the implicit length variant when using Writer Version 2 - // 2. 
At least 2x larger than the implicit length variant when using Writer Version 4 - long expectedImplicitLengthFwdIndexMaxSize; - if (writerVersion == 2) { - expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 1.15d); - } else { - expectedImplicitLengthFwdIndexMaxSize = Math.round(implicitLengthFwdIndexFile.length() * 2.0d); - } - Assert.assertTrue(expectedImplicitLengthFwdIndexMaxSize < explicitLengthFwdIndexFile.length()); - } - } -} From 7e1d10cc4449eb86dbdcc471aad63fb491ee468e Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 22:02:01 +0800 Subject: [PATCH 090/101] Improved unit test --- .../index/creator/VarByteChunkV4Test.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java index 6c8eb6d9fe89..3b83a793fcb0 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java @@ -33,6 +33,7 @@ import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; @@ -88,6 +89,22 @@ public void deleteDirs() { } } + public VarByteChunkForwardIndexWriterV5 writerSupplierV5(File file, ChunkCompressionType compressionType, + int chunkSize) + throws IOException { + return new 
VarByteChunkForwardIndexWriterV5(file, compressionType, chunkSize); + } + + public VarByteChunkForwardIndexWriterV4 writerSupplierV4(File file, ChunkCompressionType compressionType, + int chunkSize) + throws IOException { + return new VarByteChunkForwardIndexWriterV4(file, compressionType, chunkSize); + } + + public VarByteChunkForwardIndexReaderV4 readerSupplierV4(PinotDataBuffer buffer, FieldSpec.DataType dataType, boolean isSingleValue) { + return new VarByteChunkForwardIndexReaderV4(buffer, dataType, isSingleValue); + } + @Test(dataProvider = "params") public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) throws IOException, RuntimeException { From d06dda49737eb1d47d5117ed2a4bdaa01f529aaf Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 22:29:40 +0800 Subject: [PATCH 091/101] Refactored unit test --- .../index/creator/VarByteChunkV4Test.java | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java index 3b83a793fcb0..6c8eb6d9fe89 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java @@ -33,7 +33,6 @@ import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; -import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.segment.index.readers.forward.VarByteChunkForwardIndexReaderV4; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; @@ -89,22 
+88,6 @@ public void deleteDirs() { } } - public VarByteChunkForwardIndexWriterV5 writerSupplierV5(File file, ChunkCompressionType compressionType, - int chunkSize) - throws IOException { - return new VarByteChunkForwardIndexWriterV5(file, compressionType, chunkSize); - } - - public VarByteChunkForwardIndexWriterV4 writerSupplierV4(File file, ChunkCompressionType compressionType, - int chunkSize) - throws IOException { - return new VarByteChunkForwardIndexWriterV4(file, compressionType, chunkSize); - } - - public VarByteChunkForwardIndexReaderV4 readerSupplierV4(PinotDataBuffer buffer, FieldSpec.DataType dataType, boolean isSingleValue) { - return new VarByteChunkForwardIndexReaderV4(buffer, dataType, isSingleValue); - } - @Test(dataProvider = "params") public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) throws IOException, RuntimeException { From e9835c56973ce2510a7782df1fce149284908776 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 23:35:40 +0800 Subject: [PATCH 092/101] Add blank line --- .../segment/local/segment/index/creator/VarByteChunkV5Test.java | 1 + 1 file changed, 1 insertion(+) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index c5bbc75c2760..f80f764fe462 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ + package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; From 06c0b9511a6e34ae1e79807da0b31acf4f8c06f5 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 9 Oct 2024 23:35:59 +0800 Subject: [PATCH 093/101] Remove blank line --- .../segment/local/segment/index/creator/VarByteChunkV5Test.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java index f80f764fe462..c5bbc75c2760 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV5Test.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.pinot.segment.local.segment.index.creator; import java.io.File; From 07d6f75f2263aaa784afe10bbf6cf4799910eb9d Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 10 Oct 2024 22:42:48 +0800 Subject: [PATCH 094/101] Add blank line --- .../pinot/integration/tests/UpsertTableIntegrationTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java index 30b15c0c1a6e..f7404dc64bd8 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java @@ -71,6 +71,7 @@ * - Dimension fields: playerId:int (primary key), name:string, game:string, deleted:boolean * - Metric fields: score:float * - DataTime fields: timestampInEpoch:long + * */ 
public class UpsertTableIntegrationTest extends BaseClusterIntegrationTestSet { private static final String INPUT_DATA_SMALL_TAR_FILE = "gameScores_csv.tar.gz"; From 680fc2403123c1b7c14976bfc417fc40074e6852 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 10 Oct 2024 22:43:01 +0800 Subject: [PATCH 095/101] Remove blank line --- .../pinot/integration/tests/UpsertTableIntegrationTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java index f7404dc64bd8..30b15c0c1a6e 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/UpsertTableIntegrationTest.java @@ -71,7 +71,6 @@ * - Dimension fields: playerId:int (primary key), name:string, game:string, deleted:boolean * - Metric fields: score:float * - DataTime fields: timestampInEpoch:long - * */ public class UpsertTableIntegrationTest extends BaseClusterIntegrationTestSet { private static final String INPUT_DATA_SMALL_TAR_FILE = "gameScores_csv.tar.gz"; From 1b222342cafc488ab7235b52b30e3a77efb64277 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Tue, 15 Oct 2024 05:50:31 +0800 Subject: [PATCH 096/101] Removed redundant RuntimeException from method signature --- .../segment/local/segment/index/creator/VarByteChunkV4Test.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java index 6c8eb6d9fe89..8387c93dc838 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java +++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/VarByteChunkV4Test.java @@ -90,7 +90,7 @@ public void deleteDirs() { @Test(dataProvider = "params") public void testStringSV(File file, ChunkCompressionType compressionType, int longestEntry, int chunkSize) - throws IOException, RuntimeException { + throws IOException { File stringSVFile = new File(file, "testStringSV"); testWriteRead(stringSVFile, compressionType, longestEntry, chunkSize, FieldSpec.DataType.STRING, x -> x, VarByteChunkForwardIndexWriterV4::putString, (reader, context, docId) -> reader.getString(docId, context)); From 0da0ca7590019992d0e459cc50049c9853db4dd3 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 17 Oct 2024 02:11:12 +0800 Subject: [PATCH 097/101] Updated javadoc for VarByteChunkForwardIndexWriterV5 --- .../VarByteChunkForwardIndexWriterV5.java | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java index 968891089b8f..a9f6fad975b6 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java @@ -26,14 +26,54 @@ /** - * Forward index writer that extends {@link VarByteChunkForwardIndexWriterV4} with the only difference being the - * version tag is now bumped from 4 to 5. + * Forward index writer that extends {@link VarByteChunkForwardIndexWriterV4} and overrides the data layout for + * multi-value fixed byte operations to improve space efficiency. * - *

The {@code VERSION} tag is a {@code static final} class variable set to {@code 5}. Since static variables - * are shadowed in the child class thus associated with the class that defines them, care must be taken to ensure - * that the parent class can correctly observe the child class's {@code VERSION} value at runtime.

+ *

Consider the following multi-value document as an example: {@code [int(1), int(2), int(3)]}. + * The current binary data layout in {@code VarByteChunkForwardIndexWriterV4} is as follows:

+ *
+ *     0x00000010 0x00000003 0x00000001 0x00000002 0x00000003
+ * 
* - *

To achieve this, the {@code getVersion()} method is overridden to return the concrete subclass's + *

    + *
  1. The first 4 bytes ({@code 0x00000010}) represent the total payload length of the byte array + * containing the multi-value document content, which in this case is 16 bytes.
  2. + * + *
  3. The next 4 bytes ({@code 0x00000003}) represent the number of elements in the multi-value document (i.e., 3) + * .
  4. + * + *
  5. The remaining 12 bytes ({@code 0x00000001 0x00000002 0x00000003}) represent the 3 integer values of the + * multi-value document: 1, 2, and 3.
  6. + *
+ * + *

In Pinot, the fixed byte raw forward index can only store one specific fixed-length data type: + * {@code int}, {@code long}, {@code float}, or {@code double}. Instead of explicitly storing the number of elements + * for each document for multi-value document, this value can be inferred by:

+ *
+ *     number of elements = buffer payload length / size of data type
+ * 
+ * + *

If the forward index uses the passthrough chunk compression type (i.e., no compression), we can save + * 4 bytes per document by omitting the explicit element count. This leads to the following space savings:

+ * + *
    + *
  • For documents with 0 elements, we save 50%.
  • + *
  • For documents with 1 element, we save 33%.
  • + *
  • For documents with 2 elements, we save 25%.
  • + *
  • As the number of elements increases, the percentage of space saved decreases.
  • + *
+ * + *

For forward indexes that use compression to reduce data size, the savings can be even more significant + * in certain cases. This is demonstrated in the unit test {@link VarByteChunkV5Test#validateCompressionRatioIncrease}, + * where ZStandard was used as the chunk compressor. In the test, 1 million short multi-value (MV) documents + * were inserted, following a Gaussian distribution for document lengths. Additionally, the values of each integer + * in the MV documents were somewhat repetitive. Under these conditions, we observed a 50%+ reduction in on-disk + * file size compared to the V4 forward index writer version.

+ * + *

Note that the {@code VERSION} tag is a {@code static final} class variable set to {@code 5}. Since static + * variables are shadowed in the child class thus associated with the class that defines them, care must be taken to + * ensure that the parent class can correctly observe the child class's {@code VERSION} value at runtime. To handle + * this cleanly and correctly, the {@code getVersion()} method is overridden to return the concrete subclass's * {@code VERSION} value, ensuring that the correct version number is returned even when using a reference * to the parent class.

* From 89ec8af7bacc19d9ac6415968036ee81588bcb2f Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 17 Oct 2024 02:44:14 +0800 Subject: [PATCH 098/101] Addressed code review comments to use `getVersion()` in forward index reader to fetch version number. --- .../impl/VarByteChunkForwardIndexWriterV4.java | 10 ++++++++-- .../impl/VarByteChunkForwardIndexWriterV5.java | 14 ++++++++++---- .../forward/VarByteChunkForwardIndexReaderV4.java | 3 ++- .../forward/VarByteChunkForwardIndexReaderV5.java | 3 ++- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index f94fd64cee75..52e6cb45c6ea 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -107,7 +107,13 @@ public VarByteChunkForwardIndexWriterV4(File file, ChunkCompressionType compress writeHeader(_chunkCompressor.compressionType(), chunkSize); } - public int getVersion() { + // Child class must shadow this static method + public static int getVersion() { + return VERSION; + } + + // Child class must override this class instance method + protected int getConcreteClassVersion() { return VERSION; } @@ -115,7 +121,7 @@ private void writeHeader(ChunkCompressionType compressionType, int targetDecompr throws IOException { // keep metadata BE for backwards compatibility // (e.g. 
the version needs to be read by a factory which assumes BE) - _output.writeInt(getVersion()); + _output.writeInt(getConcreteClassVersion()); _output.writeInt(targetDecompressedChunkSize); _output.writeInt(compressionType.getValue()); // reserve a slot to write the data offset into diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java index a9f6fad975b6..35b7b70f1f57 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java @@ -73,12 +73,12 @@ *

Note that the {@code VERSION} tag is a {@code static final} class variable set to {@code 5}. Since static * variables are shadowed in the child class thus associated with the class that defines them, care must be taken to * ensure that the parent class can correctly observe the child class's {@code VERSION} value at runtime. To handle - * this cleanly and correctly, the {@code getVersion()} method is overridden to return the concrete subclass's - * {@code VERSION} value, ensuring that the correct version number is returned even when using a reference + * this cleanly and correctly, the {@code getConcreteClassVersion()} method is overridden to return the concrete + * subclass's {@code VERSION} value, ensuring that the correct version number is returned even when using a reference * to the parent class.

* * @see VarByteChunkForwardIndexWriterV4 - * @see VarByteChunkForwardIndexWriterV5#getVersion() + * @see VarByteChunkForwardIndexWriterV5#getConcreteClassVersion() */ @NotThreadSafe public class VarByteChunkForwardIndexWriterV5 extends VarByteChunkForwardIndexWriterV4 { @@ -89,8 +89,14 @@ public VarByteChunkForwardIndexWriterV5(File file, ChunkCompressionType compress super(file, compressionType, chunkSize); } + // Hide the parent class getVersion() + public static int getVersion() { + return VERSION; + } + + // Override the parent class getConcreteClassVersion(); @Override - public int getVersion() { + public int getConcreteClassVersion() { return VERSION; } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java index b8b007ec8dd1..ac1e739cb9ff 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java @@ -82,7 +82,8 @@ public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.Da public void validateIndexVersion(PinotDataBuffer dataBuffer) { int version = dataBuffer.getInt(0); - Preconditions.checkState(version == VarByteChunkForwardIndexWriterV4.VERSION, "Illegal index version: %s", version); + Preconditions.checkState(version == VarByteChunkForwardIndexWriterV4.getVersion(), "Illegal index version: %s", + version); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java index 3221479c9f91..a3c7ba860749 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -40,7 +40,8 @@ public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.Da @Override public void validateIndexVersion(PinotDataBuffer dataBuffer) { int version = dataBuffer.getInt(0); - Preconditions.checkState(version == VarByteChunkForwardIndexWriterV5.VERSION, "Illegal index version: %s", version); + Preconditions.checkState(version == VarByteChunkForwardIndexWriterV5.getVersion(), "Illegal index version: %s", + version); } @Override From 592967aaef69255dd03af67bbd90ccbe46a86685 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 17 Oct 2024 05:32:16 +0800 Subject: [PATCH 099/101] Addressed final minor code review suggestion. 
--- .../io/writer/impl/VarByteChunkForwardIndexWriterV4.java | 5 ----- .../io/writer/impl/VarByteChunkForwardIndexWriterV5.java | 5 ----- .../readers/forward/VarByteChunkForwardIndexReaderV4.java | 7 +++++-- .../readers/forward/VarByteChunkForwardIndexReaderV5.java | 7 ++----- 4 files changed, 7 insertions(+), 17 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index 52e6cb45c6ea..9946e0a2c972 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -107,11 +107,6 @@ public VarByteChunkForwardIndexWriterV4(File file, ChunkCompressionType compress writeHeader(_chunkCompressor.compressionType(), chunkSize); } - // Child class must shadow this static method - public static int getVersion() { - return VERSION; - } - // Child class must override this class instance method protected int getConcreteClassVersion() { return VERSION; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java index 35b7b70f1f57..4c336c3f08df 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java @@ -89,11 +89,6 @@ public VarByteChunkForwardIndexWriterV5(File file, ChunkCompressionType compress super(file, compressionType, chunkSize); } - // Hide the parent class getVersion() - public static int 
getVersion() { - return VERSION; - } - // Override the parent class getConcreteClassVersion(); @Override public int getConcreteClassVersion() { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java index ac1e739cb9ff..cf2a8b4de4da 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java @@ -82,8 +82,11 @@ public VarByteChunkForwardIndexReaderV4(PinotDataBuffer dataBuffer, FieldSpec.Da public void validateIndexVersion(PinotDataBuffer dataBuffer) { int version = dataBuffer.getInt(0); - Preconditions.checkState(version == VarByteChunkForwardIndexWriterV4.getVersion(), "Illegal index version: %s", - version); + Preconditions.checkState(version == getVersion(), "Illegal index version: %s", version); + } + + public int getVersion() { + return VarByteChunkForwardIndexWriterV4.VERSION; } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java index a3c7ba860749..e72fedfc584e 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV5.java @@ -18,7 +18,6 @@ */ package org.apache.pinot.segment.local.segment.index.readers.forward; -import com.google.common.base.Preconditions; import 
org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; import org.apache.pinot.segment.local.utils.ArraySerDeUtils; @@ -38,10 +37,8 @@ public VarByteChunkForwardIndexReaderV5(PinotDataBuffer dataBuffer, FieldSpec.Da } @Override - public void validateIndexVersion(PinotDataBuffer dataBuffer) { - int version = dataBuffer.getInt(0); - Preconditions.checkState(version == VarByteChunkForwardIndexWriterV5.getVersion(), "Illegal index version: %s", - version); + public int getVersion() { + return VarByteChunkForwardIndexWriterV5.VERSION; } @Override From 44b0df8538a53ee536a8927dd64a76af74b5e142 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 17 Oct 2024 12:22:33 +0800 Subject: [PATCH 100/101] Change getConcreteClassVersion back to getVersion --- .../impl/VarByteChunkForwardIndexWriterV4.java | 4 ++-- .../impl/VarByteChunkForwardIndexWriterV5.java | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index 9946e0a2c972..eea27036e373 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -108,7 +108,7 @@ public VarByteChunkForwardIndexWriterV4(File file, ChunkCompressionType compress } // Child class must override this class instance method - protected int getConcreteClassVersion() { + protected int getVersion() { return VERSION; } @@ -116,7 +116,7 @@ private void writeHeader(ChunkCompressionType compressionType, int targetDecompr throws IOException { // keep metadata BE for backwards compatibility // 
(e.g. the version needs to be read by a factory which assumes BE) - _output.writeInt(getConcreteClassVersion()); + _output.writeInt(getVersion()); _output.writeInt(targetDecompressedChunkSize); _output.writeInt(compressionType.getValue()); // reserve a slot to write the data offset into diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java index 4c336c3f08df..b96812a05936 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV5.java @@ -73,12 +73,12 @@ *

Note that the {@code VERSION} tag is a {@code static final} class variable set to {@code 5}. Since static * variables are shadowed in the child class thus associated with the class that defines them, care must be taken to * ensure that the parent class can correctly observe the child class's {@code VERSION} value at runtime. To handle - * this cleanly and correctly, the {@code getConcreteClassVersion()} method is overridden to return the concrete - * subclass's {@code VERSION} value, ensuring that the correct version number is returned even when using a reference - * to the parent class.

+ * this cleanly and correctly, the {@code getVersion()} method is overridden to return the concrete subclass's + * {@code VERSION} value, ensuring that the correct version number is returned even when using a reference to the + * parent class.

* * @see VarByteChunkForwardIndexWriterV4 - * @see VarByteChunkForwardIndexWriterV5#getConcreteClassVersion() + * @see VarByteChunkForwardIndexWriterV5#getVersion() */ @NotThreadSafe public class VarByteChunkForwardIndexWriterV5 extends VarByteChunkForwardIndexWriterV4 { @@ -89,9 +89,9 @@ public VarByteChunkForwardIndexWriterV5(File file, ChunkCompressionType compress super(file, compressionType, chunkSize); } - // Override the parent class getConcreteClassVersion(); + // Override the parent class getVersion(); @Override - public int getConcreteClassVersion() { + public int getVersion() { return VERSION; } From 6fe4517362b08df53d48046eab17f1640597a8f4 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Fri, 18 Oct 2024 04:03:29 +0800 Subject: [PATCH 101/101] Adjusted member variable scope in VarByteChunkForwardIndexWriterV4 --- .../io/writer/impl/VarByteChunkForwardIndexWriterV4.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index eea27036e373..332c52d0c59f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -77,12 +77,12 @@ public class VarByteChunkForwardIndexWriterV4 implements VarByteChunkWriter { public static final int VERSION = 4; // Use the run-time concrete class to retrieve the logger - protected final Logger _logger = LoggerFactory.getLogger(this.getClass()); + protected final Logger _logger = LoggerFactory.getLogger(getClass()); private static final String DATA_BUFFER_SUFFIX = ".buf"; private final File _dataBuffer; - protected final RandomAccessFile _output; + private final RandomAccessFile 
_output; private final FileChannel _dataChannel; private final ByteBuffer _chunkBuffer; private final ByteBuffer _compressionBuffer;