Skip to content

Commit

Permalink
Enhance optimizeDictionary to optionally optimize var-width type co…
Browse files Browse the repository at this point in the history
…ls (#13994)
  • Loading branch information
itschrispeck authored Oct 14, 2024
1 parent 5c06547 commit afb63bc
Show file tree
Hide file tree
Showing 8 changed files with 299 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -293,8 +293,9 @@ private boolean createDictionaryForColumn(ColumnIndexCreationInfo info, SegmentG

FieldIndexConfigs fieldIndexConfigs = config.getIndexConfigsByColName().get(column);
if (DictionaryIndexType.ignoreDictionaryOverride(config.isOptimizeDictionary(),
config.isOptimizeDictionaryForMetrics(), config.getNoDictionarySizeRatioThreshold(), spec, fieldIndexConfigs,
info.getDistinctValueCount(), info.getTotalNumberOfEntries())) {
config.isOptimizeDictionaryForMetrics(), config.getNoDictionarySizeRatioThreshold(),
config.getNoDictionaryCardinalityRatioThreshold(), spec, fieldIndexConfigs, info.getDistinctValueCount(),
info.getTotalNumberOfEntries())) {
// Ignore overrides and pick from config
createDictionary = info.isCreateDictionary();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,11 +218,12 @@ public static boolean shouldUseVarLengthDictionary(FieldSpec.DataType columnStor
* This function evaluates whether to override dictionary (i.e use noDictionary)
* for a column even when its explicitly configured. This evaluation is for both dimension and metric
* column types.
*
* @return true if dictionary should be created, false if noDictionary should be used
*/
public static boolean ignoreDictionaryOverride(boolean optimizeDictionary,
boolean optimizeDictionaryForMetrics, double noDictionarySizeRatioThreshold,
FieldSpec fieldSpec, FieldIndexConfigs fieldIndexConfigs, int cardinality,
int totalNumberOfEntries) {
public static boolean ignoreDictionaryOverride(boolean optimizeDictionary, boolean optimizeDictionaryForMetrics,
double noDictionarySizeRatioThreshold, @Nullable Double noDictionaryCardinalityRatioThreshold,
FieldSpec fieldSpec, FieldIndexConfigs fieldIndexConfigs, int cardinality, int totalNumberOfEntries) {
// For an inverted index dictionary is required
if (fieldIndexConfigs.getConfig(StandardIndexes.inverted()).isEnabled()) {
return true;
Expand All @@ -236,22 +237,38 @@ public static boolean ignoreDictionaryOverride(boolean optimizeDictionary,
// Do not create dictionary if index size with dictionary is going to be larger than index size without dictionary
// This is done to reduce the cost of dictionary for high cardinality columns
// Off by default and needs optimizeDictionary to be set to true
if (fieldSpec.isSingleValueField() && fieldSpec.getDataType().isFixedWidth()) {
// if you can safely enable dictionary, you can ignore overrides
return canSafelyCreateDictionaryWithinThreshold(cardinality, totalNumberOfEntries,
noDictionarySizeRatioThreshold, fieldSpec);
if (fieldSpec.isSingleValueField()) {
return ignoreDictionaryOverrideForSingleValueFields(cardinality, totalNumberOfEntries,
noDictionarySizeRatioThreshold, noDictionaryCardinalityRatioThreshold, fieldSpec);
}
}
if (optimizeDictionaryForMetrics && !optimizeDictionary && fieldSpec.isSingleValueField()
&& fieldSpec.getFieldType() == FieldSpec.FieldType.METRIC) {
return ignoreDictionaryOverrideForSingleValueFields(cardinality, totalNumberOfEntries,
noDictionarySizeRatioThreshold, noDictionaryCardinalityRatioThreshold, fieldSpec);
}
return true;
}

if (optimizeDictionaryForMetrics && !optimizeDictionary) {
if (fieldSpec.isSingleValueField() && fieldSpec.getDataType().isFixedWidth() && fieldSpec.getFieldType()
== FieldSpec.FieldType.METRIC) {
/**
* Hold common logic for ignoring dictionary override for single value fields, used for dim and metric cols
*/
private static boolean ignoreDictionaryOverrideForSingleValueFields(int cardinality, int totalNumberOfEntries,
double noDictionarySizeRatioThreshold, Double noDictionaryCardinalityRatioThreshold, FieldSpec fieldSpec) {
if (fieldSpec.isSingleValueField()) {
if (fieldSpec.getDataType().isFixedWidth()) {
// if you can safely enable dictionary, you can ignore overrides
return canSafelyCreateDictionaryWithinThreshold(cardinality, totalNumberOfEntries,
noDictionarySizeRatioThreshold, fieldSpec);
}
// Config not set, default to old behavior of create dictionary for var width cols
if (noDictionaryCardinalityRatioThreshold == null) {
return true;
}
// Variable width type, so create based simply on cardinality threshold since size cannot be calculated easily
return noDictionaryCardinalityRatioThreshold * totalNumberOfEntries > cardinality;
}
return true;
return false;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,10 @@ Map<String, List<Operation>> computeOperations(SegmentDirectory.Reader segmentRe
if (existingColumnMetadata.getFieldSpec().getFieldType() != FieldSpec.FieldType.COMPLEX
&& DictionaryIndexType.ignoreDictionaryOverride(_tableConfig.getIndexingConfig().isOptimizeDictionary(),
_tableConfig.getIndexingConfig().isOptimizeDictionaryForMetrics(),
_tableConfig.getIndexingConfig().getNoDictionarySizeRatioThreshold(), existingColumnMetadata.getFieldSpec(),
_fieldIndexConfigs.get(column), existingColumnMetadata.getCardinality(),
existingColumnMetadata.getTotalNumberOfEntries())) {
_tableConfig.getIndexingConfig().getNoDictionarySizeRatioThreshold(),
_tableConfig.getIndexingConfig().getNoDictionaryCardinalityRatioThreshold(),
existingColumnMetadata.getFieldSpec(), _fieldIndexConfigs.get(column),
existingColumnMetadata.getCardinality(), existingColumnMetadata.getTotalNumberOfEntries())) {
columnOperationsMap.put(column, Collections.singletonList(Operation.ENABLE_DICTIONARY));
}
} else if (existingHasDict && !newIsDict) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.pinot.segment.local.segment.creator;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoader;
import org.apache.pinot.segment.local.segment.creator.impl.SegmentCreationDriverFactory;
import org.apache.pinot.segment.spi.ImmutableSegment;
import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig;
import org.apache.pinot.segment.spi.creator.SegmentIndexCreationDriver;
import org.apache.pinot.segment.spi.creator.SegmentVersion;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.TableType;
import org.apache.pinot.spi.config.table.ingestion.IngestionConfig;
import org.apache.pinot.spi.data.DimensionFieldSpec;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.Schema;
import org.apache.pinot.spi.data.readers.FileFormat;
import org.apache.pinot.spi.utils.ReadMode;
import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;


public class DictionaryOptimizerCardinalityTest {

private static final Logger LOGGER = LoggerFactory.getLogger(DictionaryOptimizerCardinalityTest.class);
private static final File INDEX_DIR = new File(DictionaryOptimizerCardinalityTest.class.toString());
private static File _segmentDirectory;
private static File _csvFile;

// Test cardinality based dictionary optimization for var-length data type columns
@Test
public void testDictionaryForMixedCardinalitiesStringType()
throws Exception {

ImmutableSegment heapSegment = ImmutableSegmentLoader.load(_segmentDirectory, ReadMode.heap);

Schema schema = heapSegment.getSegmentMetadata().getSchema();
for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
// Skip virtual columns
if (fieldSpec.isVirtualColumn()) {
continue;
}

String columnName = fieldSpec.getName();
if ("low_cardinality_strings".equals(columnName)) {
Assert.assertTrue(heapSegment.getForwardIndex(columnName).isDictionaryEncoded(),
"Low cardinality columns should be dictionary encoded");
}

if ("high_cardinality_strings".equals(columnName)) {
Assert.assertFalse(heapSegment.getForwardIndex(columnName).isDictionaryEncoded(),
"High cardinality columns should be raw encoded");
}
}
}

@BeforeClass
private void setup()
throws Exception {

if (INDEX_DIR.exists()) {
FileUtils.deleteQuietly(INDEX_DIR);
}
INDEX_DIR.mkdirs();
_csvFile = new File(INDEX_DIR, "data.csv");
generateCsv(_csvFile, 500);

IngestionConfig ingestionConfig = new IngestionConfig();
ingestionConfig.setRowTimeValueCheck(false);
ingestionConfig.setSegmentTimeValueCheck(false);
Schema schema =
new Schema.SchemaBuilder().addSingleValueDimension("low_cardinality_strings", FieldSpec.DataType.STRING)
.addSingleValueDimension("high_cardinality_strings", FieldSpec.DataType.STRING)
.addDateTimeField("ts", FieldSpec.DataType.LONG, "1:MILLISECONDS:EPOCH", "1:MILLISECONDS").build();
List<DimensionFieldSpec> stringColumns =
schema.getDimensionFieldSpecs().stream().filter(x -> x.getDataType() == FieldSpec.DataType.STRING).collect(
Collectors.toList());

List<FieldConfig> fieldConfigList = stringColumns.stream().map(
x -> new FieldConfig(x.getName(), FieldConfig.EncodingType.DICTIONARY, Collections.emptyList(), null, null))
.collect(Collectors.toList());

SegmentGeneratorConfig segmentGenSpec =
new SegmentGeneratorConfig(new TableConfigBuilder(TableType.OFFLINE).setTableName("tableName")
.setIngestionConfig(ingestionConfig).setFieldConfigList(fieldConfigList).build(),
schema);

segmentGenSpec.setInputFilePath(_csvFile.getAbsolutePath());
segmentGenSpec.setTimeColumnName("ts");
segmentGenSpec.setSegmentTimeUnit(TimeUnit.SECONDS);
segmentGenSpec.setFormat(FileFormat.CSV);
segmentGenSpec.setSegmentVersion(SegmentVersion.v1);
segmentGenSpec.setTableName("tableName");
segmentGenSpec.setOutDir(INDEX_DIR.getAbsolutePath());
segmentGenSpec.setOptimizeDictionary(true);
segmentGenSpec.setNoDictionaryCardinalityRatioThreshold(0.1); // cardinality must be <10% of total docs to override

SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
driver.init(segmentGenSpec);
driver.build();

_segmentDirectory = new File(INDEX_DIR, driver.getSegmentName());
}

// Generate a 3 columns csv file, sample format is:
// low_cardinality_strings,high_cardinality_strings,ts
// Red,kdeejdfnsd,1600000000
private void generateCsv(File file, int numberOfRows) throws IOException {
String[] lowCardinalityOptions = {"Red", "Blue", "Green", "Yellow", "Purple"};
String alphabet = "abcdefghijklmnopqrstuvwxyz";
Random random = new Random(42);

try (FileWriter writer = new FileWriter(file, false)) {
// Write the header
writer.append("low_cardinality_strings,high_cardinality_strings,ts\n");

long startTimestamp = System.currentTimeMillis() / 1000;
for (int i = 0; i < numberOfRows; i++) {
String lowCardinality = lowCardinalityOptions[random.nextInt(lowCardinalityOptions.length)];
StringBuilder highCardinality = new StringBuilder(10);
for (int j = 0; j < 10; j++) {
highCardinality.append(alphabet.charAt(random.nextInt(alphabet.length())));
}
long timestamp = startTimestamp + (i / 10);
writer.append(String.format("%s,%s,%d\n", lowCardinality, highCardinality, timestamp));
}
}
}

@AfterClass
public void cleanup() {
FileUtils.deleteQuietly(_csvFile);
FileUtils.deleteQuietly(_segmentDirectory);
FileUtils.deleteQuietly(INDEX_DIR);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@
import org.apache.pinot.segment.local.segment.index.AbstractSerdeIndexContract;
import org.apache.pinot.segment.spi.index.DictionaryIndexConfig;
import org.apache.pinot.segment.spi.index.FieldIndexConfigs;
import org.apache.pinot.segment.spi.index.IndexType;
import org.apache.pinot.segment.spi.index.StandardIndexes;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.IndexConfig;
import org.apache.pinot.spi.config.table.Intern;
import org.apache.pinot.spi.config.table.JsonIndexConfig;
import org.apache.pinot.spi.data.DimensionFieldSpec;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.MetricFieldSpec;
import org.apache.pinot.spi.utils.JsonUtils;
import org.mockito.Mockito;
import org.testng.Assert;
import org.testng.annotations.Test;

Expand Down Expand Up @@ -349,28 +350,60 @@ public void testStandardIndex() {
+ "the DictionaryIndexType static instance");
}

/**
* Tests to verify various combinations of inputs to test dictionary override optimization.
*/
@Test
public void testDictionaryOverride() {
MetricFieldSpec metric = new MetricFieldSpec("testCol", FieldSpec.DataType.DOUBLE);
FieldIndexConfigs fieldIndexConfigs = new FieldIndexConfigs.Builder().build();
IndexType index1 = Mockito.mock(IndexType.class);
Mockito.when(index1.getId()).thenReturn("index1");
IndexConfig indexConf = new IndexConfig(true);
FieldIndexConfigs fieldIndexConfigs = new FieldIndexConfigs.Builder().add(index1, indexConf).build();
// No need to disable dictionary
assertTrue(DictionaryIndexType.ignoreDictionaryOverride(false, true, 2, metric, fieldIndexConfigs, 5, 20));
boolean result =
DictionaryIndexType.ignoreDictionaryOverride(false, true, 2, null, metric, fieldIndexConfigs, 5, 20);
assertTrue(result);

// Set a higher noDictionarySizeRatioThreshold
assertFalse(DictionaryIndexType.ignoreDictionaryOverride(false, true, 5, metric, fieldIndexConfigs, 5, 20));
result = DictionaryIndexType.ignoreDictionaryOverride(false, true, 5, null, metric, fieldIndexConfigs, 5, 20);
assertFalse(result);

// optimizeDictionary and optimizeDictionaryForMetrics both turned on
assertFalse(DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, metric, fieldIndexConfigs, 5, 20));
result = DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, null, metric, fieldIndexConfigs, 5, 20);
assertFalse(result);

// noDictionarySizeRatioThreshold and noDictionaryCardinalityThreshold are provided
result = DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, 0.10, metric, fieldIndexConfigs, 5, 100);
assertTrue(result);

// cardinality is much less than total docs, use dictionary
metric.setDataType(FieldSpec.DataType.STRING);
result = DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, 0.10, metric, fieldIndexConfigs, 5, 100);
assertTrue(result);

// cardinality is large % of total docs, do not use dictionary
result = DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, 0.10, metric, fieldIndexConfigs, 5, 20);
assertFalse(result);

// Test Dimension col
// Don't ignore for Json. We want to disable dictionary for json.
DimensionFieldSpec dimension = new DimensionFieldSpec("testCol", FieldSpec.DataType.JSON, true);
result = DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, null, dimension, fieldIndexConfigs, 5, 20);
assertTrue(result);

// cardinality is much less than total docs, use dictionary
dimension.setDataType(FieldSpec.DataType.STRING);
result = DictionaryIndexType.ignoreDictionaryOverride(true, false, 5, 0.10, dimension, fieldIndexConfigs, 5, 100);
assertTrue(result);

// cardinality is large % of total docs, do not use dictionary
result = DictionaryIndexType.ignoreDictionaryOverride(true, false, 5, 0.10, dimension, fieldIndexConfigs, 5, 20);
assertFalse(result);

// Ignore for inverted index
IndexConfig indexConfig = new IndexConfig(false);
fieldIndexConfigs = new FieldIndexConfigs.Builder().add(StandardIndexes.inverted(), indexConfig).build();
assertTrue(DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, metric, fieldIndexConfigs, 5, 20));

// Don't ignore for JSON index
DimensionFieldSpec dimension = new DimensionFieldSpec("testCol", FieldSpec.DataType.JSON, true);
JsonIndexConfig jsonIndexConfig = new JsonIndexConfig();
fieldIndexConfigs = new FieldIndexConfigs.Builder().add(StandardIndexes.json(), jsonIndexConfig).build();
assertFalse(DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, dimension, fieldIndexConfigs, 5, 20));
assertTrue(DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, null, metric, fieldIndexConfigs, 5, 20));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ public enum TimeColumnType {
private boolean _optimizeDictionary = false;
private boolean _optimizeDictionaryForMetrics = false;
private double _noDictionarySizeRatioThreshold = IndexingConfig.DEFAULT_NO_DICTIONARY_SIZE_RATIO_THRESHOLD;
private Double _noDictionaryCardinalityRatioThreshold;
private boolean _realtimeConversion = false;
// consumerDir contains data from the consuming segment, and is used during _realtimeConversion optimization
private File _consumerDir;
Expand Down Expand Up @@ -208,6 +209,7 @@ public SegmentGeneratorConfig(TableConfig tableConfig, Schema schema) {
_optimizeDictionary = indexingConfig.isOptimizeDictionary();
_optimizeDictionaryForMetrics = indexingConfig.isOptimizeDictionaryForMetrics();
_noDictionarySizeRatioThreshold = indexingConfig.getNoDictionarySizeRatioThreshold();
_noDictionaryCardinalityRatioThreshold = indexingConfig.getNoDictionaryCardinalityRatioThreshold();
}

IngestionConfig ingestionConfig = tableConfig.getIngestionConfig();
Expand Down Expand Up @@ -805,6 +807,16 @@ public void setNoDictionarySizeRatioThreshold(double noDictionarySizeRatioThresh
_noDictionarySizeRatioThreshold = noDictionarySizeRatioThreshold;
}

@Nullable
public Double getNoDictionaryCardinalityRatioThreshold() {
return _noDictionaryCardinalityRatioThreshold;
}

public void setNoDictionaryCardinalityRatioThreshold(@Nullable Double noDictionaryCardinalityRatioThreshold) {
_noDictionaryCardinalityRatioThreshold = noDictionaryCardinalityRatioThreshold;
}


public boolean isFailOnEmptySegment() {
return _failOnEmptySegment;
}
Expand Down
Loading

0 comments on commit afb63bc

Please sign in to comment.