diff --git a/pom.xml b/pom.xml index b534795b..4165d875 100644 --- a/pom.xml +++ b/pom.xml @@ -145,6 +145,11 @@ junit-jupiter-engine test + + org.junit.jupiter + junit-jupiter-params + test + diff --git a/src/main/java/datawave/query/model/FieldIndexHole.java b/src/main/java/datawave/query/model/FieldIndexHole.java new file mode 100644 index 00000000..a83d85f9 --- /dev/null +++ b/src/main/java/datawave/query/model/FieldIndexHole.java @@ -0,0 +1,85 @@ +package datawave.query.model; + +import java.util.Collection; +import java.util.Comparator; +import java.util.Date; +import java.util.Objects; +import java.util.SortedSet; +import java.util.StringJoiner; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; + +import com.google.common.collect.ImmutableSortedSet; + +/** + * This class represents a set of calculated field index holes for a given fieldName and datatype. A field index hole is effectively a date where a frequency + * row was seen, but an index and/or reversed indexed row was not. + */ +public class FieldIndexHole { + + private final String fieldName; + private final String datatype; + private final SortedSet> dateRanges; + + public FieldIndexHole(String fieldName, String dataType, Collection> holes) { + this.fieldName = fieldName; + this.datatype = dataType; + // Ensure the date range set is immutable. + ImmutableSortedSet.Builder> builder = new ImmutableSortedSet.Builder<>(Comparator.naturalOrder()); + holes.forEach(p -> builder.add(new ImmutablePair<>(p.getLeft(), p.getRight()))); + dateRanges = builder.build(); + } + + /** + * Return the field name. + * + * @return the field name. + */ + public String getFieldName() { + return fieldName; + } + + /** + * Return the datatype. + * + * @return the datatype. + */ + public String getDatatype() { + return datatype; + } + + /** + * Returns the set of date ranges that span over field index holes for the fieldName and datatype of this {@link FieldIndexHole}. 
Each date range represents + * a span of consecutive days for which a frequency row exist, but an index row does not. All date ranges are start(inclusive)-end(inclusive). + * + * @return the date ranges + */ + public SortedSet> getDateRanges() { + return dateRanges; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + FieldIndexHole indexHole = (FieldIndexHole) o; + return Objects.equals(fieldName, indexHole.fieldName) && Objects.equals(datatype, indexHole.datatype) + && Objects.equals(dateRanges, indexHole.dateRanges); + } + + @Override + public int hashCode() { + return Objects.hash(fieldName, datatype, dateRanges); + } + + @Override + public String toString() { + return new StringJoiner(", ", FieldIndexHole.class.getSimpleName() + "[", "]").add("fieldName='" + fieldName + "'").add("dataType='" + datatype + "'") + .add("dateRanges=" + dateRanges).toString(); + } +} diff --git a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java index fd535ee2..7dd56289 100644 --- a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java +++ b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java @@ -3,9 +3,11 @@ import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.IOException; +import java.nio.charset.CharacterCodingException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Arrays; +import java.util.Calendar; import java.util.Collection; import java.util.Collections; import java.util.Date; @@ -16,6 +18,8 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; import java.util.concurrent.ExecutionException; import org.apache.accumulo.core.client.AccumuloClient; @@ -27,6 +31,7 @@ import org.apache.accumulo.core.data.Value; import 
org.apache.accumulo.core.iterators.user.RegExFilter; import org.apache.accumulo.core.security.Authorizations; +import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.slf4j.Logger; @@ -40,6 +45,7 @@ import com.google.common.base.Preconditions; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.HashMultimap; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import com.google.common.collect.Multimap; @@ -50,8 +56,10 @@ import datawave.data.type.Type; import datawave.query.composite.CompositeMetadata; import datawave.query.composite.CompositeMetadataHelper; +import datawave.query.model.FieldIndexHole; import datawave.security.util.AuthorizationsMinimizer; import datawave.security.util.ScannerHelper; +import datawave.util.time.DateHelper; @EnableCaching @Component("allFieldMetadataHelper") @@ -1050,6 +1058,355 @@ public Set loadDatatypes() throws TableNotFoundException { return Collections.unmodifiableSet(datatypes); } + /** + * Fetches results from {@link #metadataTableName} and calculates the set of field index holes that exists for all indexed entries. The map consists of + * field names to datatypes to field index holes. + * + * @return a map of field names and datatype pairs to field index holes + */ + @Cacheable(value = "getFieldIndexHoles", key = "{#root.target.auths,#root.target.metadataTableName}", cacheManager = "metadataHelperCacheManager") + public Map> getFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { + return getFieldIndexHoles(ColumnFamilyConstants.COLF_I); + } + + /** + * Fetches results from {@link #metadataTableName} and calculates the set of field index holes that exists for all reversed indexed entries. The map + * consists of field names to datatypes to field index holes. 
+ * + * @return a map of field names and datatype pairs to field index holes + */ + @Cacheable(value = "getReversedFieldIndexHoles", key = "{#root.target.auths,#root.target.metadataTableName}", cacheManager = "metadataHelperCacheManager") + public Map> getReversedFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { + return getFieldIndexHoles(ColumnFamilyConstants.COLF_RI); + } + + /** + * Supplies field index hole for {@link #getFieldIndexHoles()} and {@link #getReversedFieldIndexHoles()}. + */ + private Map> getFieldIndexHoles(Text indexColumnFamily) throws TableNotFoundException { + log.debug("cache fault for getFieldIndexHoles(" + this.auths + "," + this.metadataTableName + ")"); + + Scanner bs = ScannerHelper.createScanner(accumuloClient, metadataTableName, auths); + + // Fetch the frequency column and the specified index column. + bs.fetchColumnFamily(ColumnFamilyConstants.COLF_F); + bs.fetchColumnFamily(indexColumnFamily); + + // For all keys in the DatawaveMetadata table. + bs.setRange(new Range()); + + // We must first scan over the fieldName-datatype combinations and extract the date ranges in which we've seen them. Each date range represents a span + // of time when we saw an event for each day in that date range, from the start (inclusive) to end (inclusive). + Map>> frequencyMap = new HashMap<>(); + Map>> indexMap = new HashMap<>(); + Calendar calendar = Calendar.getInstance(); + + String prevFieldName = null; + String prevDatatype = null; + Date prevDate = null; + Date startDate = null; + Text prevColumnFamily = null; + + // Points to the target map object that we add date ranges to. This changes when we see a different column family compared to the previous row. We must + // initially start adding entries to the frequency map. + Map>> dateMap = frequencyMap; + + Map>> fieldIndexHoles = new HashMap<>(); + + // Scan each row and extract the date ranges. 
+ for (Entry entry : bs) { + Key key = entry.getKey(); + String fieldName = key.getRow().toString(); + Text columnFamily = key.getColumnFamily(); + + // Parse the data type and event date from the column qualifier. + String cq = key.getColumnQualifier().toString(); + int offset = cq.indexOf(NULL_BYTE); + String datatype = cq.substring(0, offset); + Date date = DateHelper.parse(cq.substring((offset + 1))); + + // If this is the very first entry we've seen, update the tracking variables and continue to the next entry. + if (prevFieldName == null) { + prevFieldName = fieldName; + prevDatatype = datatype; + prevDate = date; + startDate = date; + prevColumnFamily = columnFamily; + continue; + } + + // The column family is different. We have two possible scenarios: + // - The previous column family was 'f'. The current row is an index row for to the current field. + // - The previous column family was the target index column family. The current row is an 'f' row for a new field. + // + // In both cases, record the last date range, and begin collecting date ranges for the next batch of related rows. + if (!prevColumnFamily.equals(columnFamily)) { + // Add the latest date range seen for the previous fieldName-datatype combination. + Pair dateRange = Pair.of(startDate, prevDate); + SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); + dates.add(dateRange); + + // The column family is "f". We have collected the date ranges for all datatypes for the previous field name. Get the field index holes for the + // previously collected data. + if (columnFamily.equals(ColumnFamilyConstants.COLF_F)) { + Multimap> datatypeHoles = getFieldIndexHoles(frequencyMap, indexMap); + fieldIndexHoles.put(prevFieldName, datatypeHoles); + // Clear the date range maps. + frequencyMap.clear(); + indexMap.clear(); + // Set the target date map to the frequency map. + dateMap = frequencyMap; + } else { + // The current column family is the target index. 
Add the latest date range seen for the previous datatype. + dateMap = indexMap; + } + // Update our tracking variables. + prevFieldName = fieldName; + prevDatatype = datatype; + startDate = date; + } else { + // The column family is the same. We have three possible scenarios: + // - A row with a field that is different to the previous field. + // - A row with the same field and datatype. + // - A row with the same field, but a different datatype. + // + // We have encountered a new field name and the previous fieldName-datatype combination did not have any corresponding index row entries. + if (!fieldName.equals(prevFieldName)) { + // Add the latest date range seen for the previous fieldName. + Pair dateRange = Pair.of(startDate, prevDate); + SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); + dates.add(dateRange); + // Add the field index holes for the previous field name. + Multimap> datatypeHoles = getFieldIndexHoles(frequencyMap, indexMap); + fieldIndexHoles.put(prevFieldName, datatypeHoles); + // Clear the date range maps. + frequencyMap.clear(); + indexMap.clear(); + // Update our tracking variables. + prevFieldName = fieldName; + prevDatatype = datatype; + startDate = date; + } else if (datatype.equals(prevDatatype)) { + // We are on the same fieldName-datatype combination as the previous row. Determine if we can add a date-range. + calendar.setTime(prevDate); + calendar.add(Calendar.DATE, 1); + // If the current date is not one day after the previous date, it is not a continuous part of the previously tracked date range. Save the + // previous date range and begin a new one. + if (!calendar.getTime().equals(date)) { + // The current date should not be included in the current date range. Add the current date range, and start a new one. 
+ Pair dateRange = Pair.of(startDate, prevDate); + SortedSet> dates = dateMap.computeIfAbsent(datatype, (k) -> new TreeSet<>()); + dates.add(dateRange); + + // Update the date tracking variables. + startDate = date; + } + } else { + // We've encountered a new datatype. Add the latest date range seen for the previous datatype. + Pair dateRange = Pair.of(startDate, prevDate); + SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); + dates.add(dateRange); + + // Update our tracking variables. + prevDatatype = datatype; + startDate = date; + } + } + // Update the previous date and column family. + prevDate = date; + prevColumnFamily = columnFamily; + } + + // After there are no more rows, ensure that we record the last date range for the last fieldName-datatype combination that we saw. + Pair dateRange = Pair.of(startDate, prevDate); + SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); + dates.add(dateRange); + + // Get the field index holes for the previous field name. + Multimap> datatypeHoles = getFieldIndexHoles(frequencyMap, indexMap); + fieldIndexHoles.put(prevFieldName, datatypeHoles); + + // Create immutable versions of the field index holes, and do not retain any empty collections. + ImmutableMap.Builder> fieldMapBuilder = new ImmutableMap.Builder<>(); + for (String fieldName : fieldIndexHoles.keySet()) { + Multimap> datatypeMap = fieldIndexHoles.get(fieldName); + if (!datatypeMap.isEmpty()) { + ImmutableMap.Builder datatypeMapBuilder = new ImmutableMap.Builder<>(); + for (String datatype : datatypeMap.keySet()) { + FieldIndexHole fieldIndexHole = new FieldIndexHole(fieldName, datatype, datatypeMap.get(datatype)); + datatypeMapBuilder.put(datatype, fieldIndexHole); + } + fieldMapBuilder.put(fieldName, datatypeMapBuilder.build()); + } + } + + // Return the finalized field index holes. 
+ return fieldMapBuilder.build(); + } + + private Multimap> getFieldIndexHoles(Map>> frequencyMap, + Map>> indexMap) { + // New tracking variables. + String prevDataType = null; + Pair prevFrequencyDateRange = null; + Date holeStartDate = null; + Multimap> fieldIndexHoles = HashMultimap.create(); + Calendar calendar = Calendar.getInstance(); + + // Compare the date ranges for each datatype to identify any and all field index holes. Evaluate the date ranges for each datatype. + for (String datatype : frequencyMap.keySet()) { + // If holeStartDate is not null, we have a hole left over from the previous datatype combination. The index hole spans from the hole + // start date to the end of the last frequency date range. + if (holeStartDate != null) { + fieldIndexHoles.put(prevDataType, Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); + holeStartDate = null; + } + + // At least one corresponding index row was seen. Compare the date ranges to identify any index holes. + if (indexMap.containsKey(datatype)) { + SortedSet> frequencyDates = frequencyMap.get(datatype); + Iterator> indexDatesIterator = indexMap.get(datatype).iterator(); + Pair prevIndexDateRange = null; + boolean comparePrevIndexDateRange = false; + // Evaluate each date range we saw for frequency rows for the current fieldName-datatype. + for (Pair frequencyDateRange : frequencyDates) { + Date frequencyStartDate = frequencyDateRange.getLeft(); + Date frequencyEndDate = frequencyDateRange.getRight(); + + // If it's been flagged that we need to compare the previous index date range to the current frequency date range, do so. This is done when + // we potentially have an index hole that spans over the end of the previous frequency date range and the start of the next frequency date + // range. 
+ if (comparePrevIndexDateRange) { + Date indexStartDate = prevIndexDateRange.getLeft(); + Date indexEndDate = prevIndexDateRange.getRight(); + + // If holeStartDate is not null, we have an index hole left over from the previous frequency date range. The index hole spans from the + // hole start date to the end of the last frequency date range. + if (holeStartDate != null) { + fieldIndexHoles.put(datatype, Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); + holeStartDate = null; + } + + // The index start date is equal to the frequency start date. Check for a hole. + if (indexStartDate.equals(frequencyStartDate)) { + if (!indexEndDate.equals(frequencyEndDate)) { + // There is an index hole starting the day after the index end date. We must evaluate the next index date range to determine the + // end date of the index hole. + calendar.setTime(indexEndDate); + calendar.add(Calendar.DATE, 1); + holeStartDate = calendar.getTime(); + } + // Otherwise there is no index hole here. + } else { + // The index start date is after the frequency start date. Check if we have a hole that partially covers the frequency date range, + // or all of it. + if (indexStartDate.before(frequencyEndDate)) { + // There is an index hole starting on the frequency start date, and ending the day before the index start date. + calendar.setTime(indexStartDate); + calendar.add(Calendar.DATE, -1); + fieldIndexHoles.put(datatype, Pair.of(frequencyStartDate, calendar.getTime())); + + if (indexEndDate.before(frequencyEndDate)) { + // There is an index hole starting the day after the index end date. We must evaluate the next index date range to determine + // the end date of the index hole. + calendar.setTime(indexEndDate); + calendar.add(Calendar.DATE, 1); + holeStartDate = calendar.getTime(); + } + } else { + // The entire frequency date range is an index hole. Add it as such, and continue to the next frequency date range. 
We want to + // compare the current index date range to the next frequency date range as well. + fieldIndexHoles.put(datatype, frequencyDateRange); + continue; + } + } + comparePrevIndexDateRange = false; + } + + // Evaluate each index date range against the current frequency date range. If we see an index date range that begins after the current + // frequency date range, we will skip to the next frequency date range. + while (indexDatesIterator.hasNext()) { + Pair indexDateRange = indexDatesIterator.next(); + Date indexStartDate = indexDateRange.getLeft(); + Date indexEndDate = indexDateRange.getRight(); + + if (indexStartDate.equals(frequencyStartDate)) { + if (indexEndDate.equals(frequencyEndDate)) { + // The current index date range is equal to the current frequency date rang, and there is no index hole for the current + // frequency date range. Break out of the loop and continue to the next frequency date range. + prevIndexDateRange = indexDateRange; + break; + } else { + // There is an index hole starting the day after the index end date. Mark the start date, and continue to the next index date + // range to determine the end date. + calendar.setTime(indexEndDate); + calendar.add(Calendar.DATE, 1); + holeStartDate = calendar.getTime(); + } + } else if (indexStartDate.before(frequencyEndDate)) { + calendar.setTime(indexStartDate); + calendar.add(Calendar.DATE, -1); + if (holeStartDate != null) { + // If holeStartDate is not null, we've previously identified the start of an index hole that is not the start of the frequency + // date range. There is an index hole from holeStartDate to the day before the index start date. + fieldIndexHoles.put(datatype, Pair.of(holeStartDate, calendar.getTime())); + holeStartDate = null; + } else { + // There is an index hole from the frequency start date to the day before the index start date. 
+ fieldIndexHoles.put(datatype, Pair.of(frequencyStartDate, calendar.getTime())); + } + + // It's possible for the current index date range to end before the current frequency date range. If so, this indicates a new index + // hole. + if (indexEndDate.before(frequencyEndDate)) { + // There is an index hole starting the day after the index end date. We need to evaluate the next index date range to determine + // the end of the index hole. Mark the start of this new index hole. + calendar.setTime(indexEndDate); + calendar.add(Calendar.DATE, 1); + holeStartDate = calendar.getTime(); + } + } else { + // The start of the current index date range occurs after the current frequency date range. There is a hole in the current frequency + // date range. + if (holeStartDate == null) { + // The entire current frequency date range is an index hole. Add it as such and break out to continue to the next frequency + // date range. + fieldIndexHoles.put(datatype, frequencyDateRange); + break; + } else { + // There is an index hole from the recorded hole start date to the end of the frequency date range. Add it as such and break + // out to continue to the next frequency date range. + fieldIndexHoles.put(datatype, Pair.of(holeStartDate, frequencyEndDate)); + holeStartDate = null; + // The current index date range is entirely after the current frequency date range. As such, we need to compare the current + // index date range to the next frequency date range. + comparePrevIndexDateRange = true; + } + } + // Update the prev index date range. + prevIndexDateRange = indexDateRange; + } + // Update the prev frequency date range. + prevFrequencyDateRange = frequencyDateRange; + } + + } else { + // No corresponding index rows were seen for any of the frequency rows. Each date range represents an index hole. + fieldIndexHoles.putAll(datatype, frequencyMap.get(datatype)); + } + // Update the prev datatype. 
+ prevDataType = datatype; + } + + // If we have a non-null hole start date after processing all the date ranges, we have an index hole that ends at the last frequency date range seen + // for the last fieldName-datatype combination. + if (holeStartDate != null) { + fieldIndexHoles.put(prevDataType, Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); + } + + return fieldIndexHoles; + } + private static String getKey(String instanceID, String metadataTableName) { StringBuilder builder = new StringBuilder(); builder.append(instanceID).append('\0'); diff --git a/src/main/java/datawave/query/util/MetadataHelper.java b/src/main/java/datawave/query/util/MetadataHelper.java index ffccd766..2b5e1d10 100644 --- a/src/main/java/datawave/query/util/MetadataHelper.java +++ b/src/main/java/datawave/query/util/MetadataHelper.java @@ -71,6 +71,7 @@ import datawave.marking.MarkingFunctions; import datawave.query.composite.CompositeMetadata; import datawave.query.model.Direction; +import datawave.query.model.FieldIndexHole; import datawave.query.model.FieldMapping; import datawave.query.model.ModelKeyParser; import datawave.query.model.QueryModel; @@ -1415,6 +1416,24 @@ protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final Stri return date; } + /** + * Return the field index holes calculated between all "i" and "f" entries. The map consists of field names to datatypes to field index holes. + * + * @return the field index holes + */ + public Map> getFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { + return allFieldMetadataHelper.getFieldIndexHoles(); + } + + /** + * Return the field index holes calculated between all "ri" and "f" entries. The map consists of field names to datatypes to field index holes. 
+ * + * @return the field index holes + */ + public Map> getReversedFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { + return allFieldMetadataHelper.getReversedFieldIndexHoles(); + } + /** * Updates the table cache via the mock connector with the given entry and writer. If writer is null, a writer will be created and returned for subsequent * use. diff --git a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java new file mode 100644 index 00000000..ebf2c561 --- /dev/null +++ b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java @@ -0,0 +1,440 @@ +package datawave.query.util; + +import java.io.File; +import java.net.URISyntaxException; +import java.nio.charset.CharacterCodingException; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Collection; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.Supplier; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.AccumuloException; +import org.apache.accumulo.core.client.AccumuloSecurityException; +import org.apache.accumulo.core.client.BatchWriter; +import org.apache.accumulo.core.client.BatchWriterConfig; +import org.apache.accumulo.core.client.MutationsRejectedException; +import org.apache.accumulo.core.client.TableExistsException; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.data.Mutation; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.user.SummingCombiner; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.commons.lang3.tuple.Pair; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; 
+import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +import datawave.accumulo.inmemory.InMemoryAccumuloClient; +import datawave.accumulo.inmemory.InMemoryInstance; +import datawave.query.composite.CompositeMetadataHelper; +import datawave.query.model.FieldIndexHole; +import datawave.util.time.DateHelper; + +class AllFieldMetadataHelperTest { + + private static final String TABLE_METADATA = "metadata"; + private static final String[] AUTHS = {"FOO"}; + private static final String NULL_BYTE = "\0"; + private static final Value NULL_VALUE = new Value(new byte[0]); + private AccumuloClient accumuloClient; + private AllFieldMetadataHelper helper; + + @BeforeAll + static void beforeAll() throws URISyntaxException { + File dir = new File(Objects.requireNonNull(ClassLoader.getSystemClassLoader().getResource(".")).toURI()); + File targetDir = dir.getParentFile(); + System.setProperty("hadoop.home.dir", targetDir.getAbsolutePath()); + } + + /** + * Set up the accumulo client and initialize the helper. 
+ */ + @BeforeEach + void setUp() throws AccumuloSecurityException, AccumuloException, TableExistsException { + accumuloClient = new InMemoryAccumuloClient("root", new InMemoryInstance(AllFieldMetadataHelper.class.toString())); + if (!accumuloClient.tableOperations().exists(TABLE_METADATA)) { + accumuloClient.tableOperations().create(TABLE_METADATA); + } + final Set allMetadataAuths = Collections.emptySet(); + final Set auths = Collections.singleton(new Authorizations(AUTHS)); + TypeMetadataHelper typeMetadataHelper = new TypeMetadataHelper(Maps.newHashMap(), allMetadataAuths, accumuloClient, TABLE_METADATA, auths, false); + CompositeMetadataHelper compositeMetadataHelper = new CompositeMetadataHelper(accumuloClient, TABLE_METADATA, auths); + helper = new AllFieldMetadataHelper(typeMetadataHelper, compositeMetadataHelper, accumuloClient, TABLE_METADATA, auths, allMetadataAuths); + } + + /** + * Clear the metadata table after each test. + */ + @AfterEach + void tearDown() throws AccumuloException, TableNotFoundException, AccumuloSecurityException { + accumuloClient.tableOperations().deleteRows(TABLE_METADATA, null, null); + } + + /** + * Write the given mutations to the metadata table. + */ + private void writeMutations(Collection mutations) { + BatchWriterConfig config = new BatchWriterConfig(); + config.setMaxMemory(0); + try (BatchWriter writer = accumuloClient.createBatchWriter(TABLE_METADATA, config)) { + writer.addMutations(mutations); + writer.flush(); + } catch (MutationsRejectedException | TableNotFoundException e) { + throw new RuntimeException(e); + } + } + + /** + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles()}. 
+ */ + @Nested + public class FieldIndexHoleTests { + + private final Supplier>> INDEX_FUNCTION = () -> { + try { + return helper.getFieldIndexHoles(); + } catch (TableNotFoundException | CharacterCodingException e) { + throw new RuntimeException(e); + } + }; + + private final Supplier>> REVERSED_INDEX_FUNCTION = () -> { + try { + return helper.getReversedFieldIndexHoles(); + } catch (TableNotFoundException | CharacterCodingException e) { + throw new RuntimeException(e); + } + }; + + private Supplier>> getIndexHoleFunction(String cf) { + return cf.equals("i") ? INDEX_FUNCTION : REVERSED_INDEX_FUNCTION; + } + + /** + * Test against data that has no field index holes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testNoFieldIndexHoles(String cf) { + // Create a series of frequency rows over date ranges, each with a matching index row for each date. + MutationCreator mutationCreator = new MutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200120"); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200120"); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200120"); + mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200120"); + mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200120"); + mutationCreator.addFrequencyMutations("EVENT_DATE", "csv", "20200101", "20200120"); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "csv", "20200101", "20200120"); + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200101", "20200120"); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200101", "20200120"); + mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200101", "20200120"); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200101", "20200120"); + writeMutations(mutationCreator.getMutations()); + 
+ // Verify that no index holes were found. + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + Assertions.assertTrue(fieldIndexHoles.isEmpty()); + } + + /** + * Test against data that has field index holes for an entire fieldName-datatype combination. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); // Do not create matching index rows for these. + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105"); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105"); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105"); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102"); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105"); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110"); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105"); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102"); + mutationCreator.addFrequencyMutations("ZETA", "csv", "20200101", "20200105"); + mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200110", "20200115"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200105"), dateRange("20200110", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where everything is an index hole. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testAllDatesAreIndexHoles(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115"); + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125"); + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles(String cf) { + MutationCreator mutationCreator = new MutationCreator(); + // Index holes for NAME-wiki on 20200103 and 20200105. + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102"); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104"); + // Index holes for NAME-csv on 20200110 and 20200113. + mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115"); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112"); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115"); + // Index hole for EVENT_DATE-wiki on 20200122. 
+ mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125"); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121"); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125"); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328"); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220"); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302"); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315"); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328"); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + } + + private Map> createFieldIndexHoleMap(FieldIndexHole... holes) { + Map> fieldIndexHoles = new HashMap<>(); + for (FieldIndexHole hole : holes) { + Map datatypeMap = fieldIndexHoles.computeIfAbsent(hole.getFieldName(), k -> new HashMap<>()); + datatypeMap.put(hole.getDatatype(), hole); + } + return fieldIndexHoles; + } + + @SafeVarargs + private FieldIndexHole createFieldIndexHole(String field, String datatype, Pair... 
dateRanges) {
        return new FieldIndexHole(field, datatype, Sets.newHashSet(dateRanges));
    }

    /**
     * Return a start(inclusive)-end(inclusive) date range pair parsed from the given yyyyMMdd strings.
     */
    private Pair<Date,Date> dateRange(String start, String end) {
        return Pair.of(DateHelper.parse(start), DateHelper.parse(end));
    }

    /**
     * Helper class for creating mutations in bulk.
     */
    private static class MutationCreator {

        private final List<Mutation> mutations = new ArrayList<>();

        /**
         * Add a frequency ("f") mutation for each day in the given date range (inclusive).
         */
        private void addFrequencyMutations(String fieldName, String datatype, String startDate, String endDate) {
            List<String> dates = getDatesInRange(startDate, endDate);
            dates.forEach(date -> addFrequencyMutation(fieldName, datatype, date));
        }

        private void addFrequencyMutation(String fieldName, String datatype, String date) {
            addMutation(fieldName, "f", datatype + NULL_BYTE + date, new Value(SummingCombiner.VAR_LEN_ENCODER.encode(1L)));
        }

        /**
         * Add an index mutation with the given column family ("i" or "ri") for each day in the given date range (inclusive).
         */
        private void addIndexMutations(String cf, String fieldName, String datatype, String startDate, String endDate) {
            List<String> dates = getDatesInRange(startDate, endDate);
            dates.forEach(date -> addIndexMutation(cf, fieldName, datatype, date));
        }

        private void addIndexMutation(String cf, String fieldName, String datatype, String date) {
            addMutation(fieldName, cf, datatype + NULL_BYTE + date, NULL_VALUE);
        }

        /**
         * Return the formatted date string for each day from startDateStr to endDateStr, inclusive.
         * The start date is returned as the original string; subsequent days are formatted via DateHelper.
         */
        private List<String> getDatesInRange(String startDateStr, String endDateStr) {
            Date endDate = DateHelper.parse(endDateStr);

            List<String> dates = new ArrayList<>();
            dates.add(startDateStr);

            // Walk forward one day at a time, collecting each day up to and including the end date.
            // NOTE(review): Calendar.getInstance() uses the default time zone — assumed consistent
            // with DateHelper's parsing zone; confirm if tests run in varied zones.
            Calendar calendar = Calendar.getInstance();
            calendar.setTime(DateHelper.parse(startDateStr));
            calendar.add(Calendar.DAY_OF_MONTH, 1);
            while (!calendar.getTime().after(endDate)) {
                dates.add(DateHelper.format(calendar.getTime()));
                calendar.add(Calendar.DAY_OF_MONTH, 1);
            }

            return dates;
        }

        /**
         * Add a single mutation with the given row, column family, column qualifier, and value.
         */
        private void addMutation(String row, String columnFamily, String columnQualifier, Value value) {
            Mutation mutation = new Mutation(row);
            mutation.put(columnFamily, columnQualifier, value);
            mutations.add(mutation);
        }

        private List<Mutation> getMutations() {
            return mutations;
        }
    }
}