diff --git a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java index 7dd56289..d844fe77 100644 --- a/src/main/java/datawave/query/util/AllFieldMetadataHelper.java +++ b/src/main/java/datawave/query/util/AllFieldMetadataHelper.java @@ -3,11 +3,9 @@ import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.IOException; -import java.nio.charset.CharacterCodingException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Arrays; -import java.util.Calendar; import java.util.Collection; import java.util.Collections; import java.util.Date; @@ -18,7 +16,9 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.SortedMap; import java.util.SortedSet; +import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ExecutionException; @@ -1062,349 +1062,366 @@ public Set loadDatatypes() throws TableNotFoundException { * Fetches results from {@link #metadataTableName} and calculates the set of field index holes that exists for all indexed entries. The map consists of * field names to datatypes to field index holes. 
* + * @param fields + * the fields to fetch field index holes for, an empty set will result in all fields being fetched + * @param datatypes + * the datatypes to fetch field index holes for, an empty set will result in all datatypes being fetched + * @param minThreshold + * the minimum percentage threshold required for an index row to be considered NOT a hole on a particular date; this should be a value in the + * range 0.0 to 1.0, and values outside that range are clamped to it * @return a map of field names and datatype pairs to field index holes */ - @Cacheable(value = "getFieldIndexHoles", key = "{#root.target.auths,#root.target.metadataTableName}", cacheManager = "metadataHelperCacheManager") - public Map> getFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { - return getFieldIndexHoles(ColumnFamilyConstants.COLF_I); + public Map> getFieldIndexHoles(Set fields, Set datatypes, double minThreshold) + throws TableNotFoundException, IOException { + return getFieldIndexHoles(ColumnFamilyConstants.COLF_I, fields, datatypes, minThreshold); } /** * Fetches results from {@link #metadataTableName} and calculates the set of field index holes that exists for all reversed indexed entries. The map * consists of field names to datatypes to field index holes. 
* + * @param fields + * the fields to fetch field index holes for, an empty set will result in all fields being fetched + * @param datatypes + * the datatypes to fetch field index holes for, an empty set will result in all datatypes being fetched + * @param minThreshold + * the minimum percentage threshold required for an index row to be considered NOT a hole on a particular date; this should be a value in the + * range 0.0 to 1.0, and values outside that range are clamped to it * @return a map of field names and datatype pairs to field index holes */ - @Cacheable(value = "getReversedFieldIndexHoles", key = "{#root.target.auths,#root.target.metadataTableName}", cacheManager = "metadataHelperCacheManager") - public Map> getReversedFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { - return getFieldIndexHoles(ColumnFamilyConstants.COLF_RI); + public Map> getReversedFieldIndexHoles(Set fields, Set datatypes, double minThreshold) + throws TableNotFoundException, IOException { + return getFieldIndexHoles(ColumnFamilyConstants.COLF_RI, fields, datatypes, minThreshold); } - /** - * Supplies field index hole for {@link #getFieldIndexHoles()} and {@link #getReversedFieldIndexHoles()}. - */ - private Map> getFieldIndexHoles(Text indexColumnFamily) throws TableNotFoundException { - log.debug("cache fault for getFieldIndexHoles(" + this.auths + "," + this.metadataTableName + ")"); + private Map> getFieldIndexHoles(Text targetColumnFamily, Set fields, Set datatypes, double minThreshold) + throws TableNotFoundException, IOException { + // Handle null fields if given. + if (fields == null) { + fields = Collections.emptySet(); + } else { + // Ensure null is not present as an entry. + fields.remove(null); + } + + // Handle null datatypes if given. + if (datatypes == null) { + datatypes = Collections.emptySet(); + } else { + // Ensure null is not present as an entry. + datatypes.remove(null); + } + + // Ensure the minThreshold is a percentage in the range 0%-100%. 
+ if (minThreshold > 1.0d) { + minThreshold = 1.0d; + } else if (minThreshold < 0.0d) { + minThreshold = 0.0d; + } Scanner bs = ScannerHelper.createScanner(accumuloClient, metadataTableName, auths); // Fetch the frequency column and the specified index column. bs.fetchColumnFamily(ColumnFamilyConstants.COLF_F); - bs.fetchColumnFamily(indexColumnFamily); + bs.fetchColumnFamily(targetColumnFamily); + + // Determine which range to use. + Range range; + if (fields.isEmpty()) { + // If no fields are specified, scan over all entries in the table. + range = new Range(); + } else if (fields.size() == 1) { + // If just one field is specified, limit the range to where the row is the field. + range = new Range(new Text(fields.iterator().next())); + } else { + // If more than one field is specified, sort the fields and limit the range from the lowest to highest field (lexicographically). + SortedSet sortedFields = new TreeSet<>(fields); + range = new Range(new Text(sortedFields.first()), new Text(sortedFields.last())); + } + bs.setRange(range); - // For all keys in the DatawaveMetadata table. - bs.setRange(new Range()); + FieldIndexHoleFinder finder = new FieldIndexHoleFinder(bs, minThreshold, fields, datatypes); + return finder.findHoles(); + } + + /** + * Utility class for finding field index holes. + */ + private static class FieldIndexHoleFinder { - // We must first scan over the fieldName-datatype combinations and extract the date ranges in which we've seen them. Each date range represents a span - // of time when we saw an event for each day in that date range, from the start (inclusive) to end (inclusive). 
- Map>> frequencyMap = new HashMap<>(); - Map>> indexMap = new HashMap<>(); - Calendar calendar = Calendar.getInstance(); + private final Scanner scanner; + private final double minThreshold; + private final Set fields; + private final Set datatypes; + private final boolean filterFields; + private final boolean filterDatatypes; - String prevFieldName = null; - String prevDatatype = null; - Date prevDate = null; - Date startDate = null; - Text prevColumnFamily = null; + // Contains datatypes to dates and counts for entries seen in "f" rows for the current field name. + private final Map> frequencyMap = new HashMap<>(); - // Points to the target map object that we add date ranges to. This changes when we see a different column family compared to the previous row. We must - // initially start adding entries to the frequency map. - Map>> dateMap = frequencyMap; + // Contains datatypes to dates and counts for entries seen in the target "i" or "ri" index rows for the current field name. + private final Map> indexMap = new HashMap<>(); + // Points to the target map object that we add entries to. This changes when we see a different column family compared to the previous row when scanning + // over entries. We must initially start adding entries to the frequency map. + private Map> targetMap = frequencyMap; + + // Map of field names to maps of datatypes to date ranges encompassing field index holes. Map>> fieldIndexHoles = new HashMap<>(); - // Scan each row and extract the date ranges. 
- for (Entry entry : bs) { - Key key = entry.getKey(); - String fieldName = key.getRow().toString(); - Text columnFamily = key.getColumnFamily(); + FieldIndexHoleFinder(Scanner scanner, double minThreshold, Set fields, Set datatypes) { + this.scanner = scanner; + this.minThreshold = minThreshold; + this.fields = Collections.unmodifiableSet(fields); + this.datatypes = Collections.unmodifiableSet(datatypes); + // Actively filter out entries based on the field if we have more than one field specified. If we have an empty set, we are searching for field + // index holes for all fields. If we have just one field, the range for the scanner will already be limited to just the field. + this.filterFields = fields.size() > 1; + // Actively filter out entries based on the datatypes if we have any datatypes specified. If we have an empty set, we are searching for field index + // holes for all datatypes. + this.filterDatatypes = !datatypes.isEmpty(); + } + + /** + * Find and return all field index holes for the scanner in this {@link FieldIndexHoleFinder}. + * + * @return the field index holes + * @throws IOException + */ + Map> findHoles() throws IOException { + String prevFieldName = null; + Text prevColumnFamily = null; - // Parse the data type and event date from the column qualifier. - String cq = key.getColumnQualifier().toString(); - int offset = cq.indexOf(NULL_BYTE); - String datatype = cq.substring(0, offset); - Date date = DateHelper.parse(cq.substring((offset + 1))); + String currFieldName; + String currDatatype; + Text currColumnFamily; + Date currDate; + Long currCount; - // If this is the very first entry we've seen, update the tracking variables and continue to the next entry. - if (prevFieldName == null) { - prevFieldName = fieldName; - prevDatatype = datatype; - prevDate = date; - startDate = date; - prevColumnFamily = columnFamily; - continue; - } - - // The column family is different. 
We have two possible scenarios: - // - The previous column family was 'f'. The current row is an index row for to the current field. - // - The previous column family was the target index column family. The current row is an 'f' row for a new field. - // - // In both cases, record the last date range, and begin collecting date ranges for the next batch of related rows. - if (!prevColumnFamily.equals(columnFamily)) { - // Add the latest date range seen for the previous fieldName-datatype combination. - Pair dateRange = Pair.of(startDate, prevDate); - SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); - dates.add(dateRange); + for (Map.Entry entry : scanner) { + // Parse the current row. + Key key = entry.getKey(); + currFieldName = key.getRow().toString(); + currColumnFamily = key.getColumnFamily(); - // The column family is "f". We have collected the date ranges for all datatypes for the previous field name. Get the field index holes for the - // previously collected data. - if (columnFamily.equals(ColumnFamilyConstants.COLF_F)) { - Multimap> datatypeHoles = getFieldIndexHoles(frequencyMap, indexMap); - fieldIndexHoles.put(prevFieldName, datatypeHoles); - // Clear the date range maps. - frequencyMap.clear(); - indexMap.clear(); - // Set the target date map to the frequency map. - dateMap = frequencyMap; - } else { - // The current column family is the target index. Add the latest date range seen for the previous datatype. - dateMap = indexMap; + String cq = key.getColumnQualifier().toString(); + int offset = cq.indexOf(NULL_BYTE); + currDatatype = cq.substring(0, offset); + + // Check if the current field and datatype are part of the fields and datatypes we want to retrieve field index holes for. + if (!isPartOfTarget(currFieldName, currDatatype)) { + continue; } - // Update our tracking variables. - prevFieldName = fieldName; - prevDatatype = datatype; - startDate = date; - } else { - // The column family is the same. 
We have three possible scenarios: - // - A row with a field that is different to the previous field. - // - A row with the same field and datatype. - // - A row with the same field, but a different datatype. + + currDate = DateHelper.parse(cq.substring((offset + 1))); + + ByteArrayInputStream byteStream = new ByteArrayInputStream(entry.getValue().get()); + DataInputStream inputStream = new DataInputStream(byteStream); + currCount = WritableUtils.readVLong(inputStream); + + // If this is the very first entry we've looked at, update our tracking variables, add the current entry to the target map, and continue to the + // next + // entry. + if (prevFieldName == null) { + addToTargetMap(currDatatype, currDate, currCount); + + prevFieldName = currFieldName; + prevColumnFamily = currColumnFamily; + continue; + } + + // The column family is different. We have two possible scenarios: + // - The previous column family was 'f'. The current row is an index row for the current field. + // - The previous column family was the target index column family. The current row is an 'f' row for a new field. // - // We have encountered a new field name and the previous fieldName-datatype combination did not have any corresponding index row entries. - if (!fieldName.equals(prevFieldName)) { - // Add the latest date range seen for the previous fieldName. - Pair dateRange = Pair.of(startDate, prevDate); - SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); - dates.add(dateRange); - // Add the field index holes for the previous field name. - Multimap> datatypeHoles = getFieldIndexHoles(frequencyMap, indexMap); - fieldIndexHoles.put(prevFieldName, datatypeHoles); - // Clear the date range maps. - frequencyMap.clear(); - indexMap.clear(); - // Update our tracking variables. 
- prevFieldName = fieldName; - prevDatatype = datatype; - startDate = date; - } else if (datatype.equals(prevDatatype)) { - // We are on the same fieldName-datatype combination as the previous row. Determine if we can add a date-range. - calendar.setTime(prevDate); - calendar.add(Calendar.DATE, 1); - // If the current date is not one day after the previous date, it is not a continuous part of the previously tracked date range. Save the - // previous date range and begin a new one. - if (!calendar.getTime().equals(date)) { - // The current date should not be included in the current date range. Add the current date range, and start a new one. - Pair dateRange = Pair.of(startDate, prevDate); - SortedSet> dates = dateMap.computeIfAbsent(datatype, (k) -> new TreeSet<>()); - dates.add(dateRange); - - // Update the date tracking variables. - startDate = date; + // In both cases, record the last entry, and begin collecting date ranges for the next batch of related rows. + if (!prevColumnFamily.equals(currColumnFamily)) { + // The column family is "f". We have collected the date ranges for all datatypes for the previous field name. Get the field index holes for + // the + // previously collected data. + if (currColumnFamily.equals(ColumnFamilyConstants.COLF_F)) { + // Find and add all field index holes for the current frequency and index entries. + findFieldIndexHoles(prevFieldName); + // Clear the entry maps. + clearEntryMaps(); + // Set the target map to the frequency map. + this.targetMap = frequencyMap; + } else { + // The current column family is the target index column family. Set the target map to the index map. + this.targetMap = indexMap; } + + // Add the current entry to the target entry map. + addToTargetMap(currDatatype, currDate, currCount); } else { - // We've encountered a new datatype. Add the latest date range seen for the previous datatype. 
- Pair dateRange = Pair.of(startDate, prevDate); - SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); - dates.add(dateRange); + // The column family is the same. We have two possible scenarios: + // - A row with a field that is different to the previous field. + // - A row with the same field. - // Update our tracking variables. - prevDatatype = datatype; - startDate = date; + // We have encountered a new field name and the previous fieldName-datatype combination did not have any corresponding index row entries. + if (!currFieldName.equals(prevFieldName)) { + // Find and add all field index holes for the current frequency and index entries. + findFieldIndexHoles(prevFieldName); + // Clear the entry maps. + clearEntryMaps(); + // Add the current entry to the target entry map. + addToTargetMap(currDatatype, currDate, currCount); + } else { + // The current row has the same field. Add the current entry to the target map. + addToTargetMap(currDatatype, currDate, currCount); + } } + + // Set the values for our prev entry to the current entry. + prevFieldName = currFieldName; + prevColumnFamily = currColumnFamily; } - // Update the previous date and column family. - prevDate = date; - prevColumnFamily = columnFamily; + + // After there are no more rows, ensure that we find any field index holes that exist in the last batch of entries. + findFieldIndexHoles(prevFieldName); + + // Return the field index holes as an immutable structure. + return getImmutableFieldIndexHoles(); } - // After there are no more rows, ensure that we record the last date range for the last fieldName-datatype combination that we saw. - Pair dateRange = Pair.of(startDate, prevDate); - SortedSet> dates = dateMap.computeIfAbsent(prevDatatype, (k) -> new TreeSet<>()); - dates.add(dateRange); - - // Get the field index holes for the previous field name. 
- Multimap> datatypeHoles = getFieldIndexHoles(frequencyMap, indexMap); - fieldIndexHoles.put(prevFieldName, datatypeHoles); - - // Create immutable versions of the field index holes, and do not retain any empty collections. - ImmutableMap.Builder> fieldMapBuilder = new ImmutableMap.Builder<>(); - for (String fieldName : fieldIndexHoles.keySet()) { - Multimap> datatypeMap = fieldIndexHoles.get(fieldName); - if (!datatypeMap.isEmpty()) { - ImmutableMap.Builder datatypeMapBuilder = new ImmutableMap.Builder<>(); - for (String datatype : datatypeMap.keySet()) { - FieldIndexHole fieldIndexHole = new FieldIndexHole(fieldName, datatype, datatypeMap.get(datatype)); - datatypeMapBuilder.put(datatype, fieldIndexHole); + /** + * Return whether the given field and datatype represent a pairing that should be evaluated for field index holes. + */ + private boolean isPartOfTarget(String field, String datatype) { + return (!filterFields || fields.contains(field)) && (!filterDatatypes || datatypes.contains(datatype)); + } + + /** + * Add the current date and count to the current target map for the current datatype. + */ + private void addToTargetMap(String datatype, Date date, Long count) { + SortedMap datesToCounts = targetMap.computeIfAbsent(datatype, (k) -> new TreeMap<>()); + datesToCounts.put(date, count); + } + + /** + * Clear the maps {@link #frequencyMap} and {@link #indexMap}. + */ + private void clearEntryMaps() { + this.frequencyMap.clear(); + this.indexMap.clear(); + } + + /** + * Find all field index holes for given field name, and store them in {@link #fieldIndexHoles}. + * + * @param fieldName + * the field name + */ + private void findFieldIndexHoles(String fieldName) { + Multimap> indexHoles = fieldIndexHoles.computeIfAbsent(fieldName, (k) -> HashMultimap.create()); + // Compare the entries for each datatype to identify any and all field index holes. + for (String datatype : frequencyMap.keySet()) { + // At least one corresponding index row was seen. 
Compare the entries to identify any index holes. + if (indexMap.containsKey(datatype)) { + // Add all index holes found for the entries for the current datatype. + Set> holes = getIndexHoles(frequencyMap.get(datatype), indexMap.get(datatype)); + indexHoles.putAll(datatype, holes); + } else { + // No corresponding index rows were seen for any of the frequency rows. Each date is an index hole. Add a date range of the earliest date to + // the latest date. + SortedMap entryMap = frequencyMap.get(datatype); + indexHoles.put(datatype, Pair.of(entryMap.firstKey(), entryMap.lastKey())); } - fieldMapBuilder.put(fieldName, datatypeMapBuilder.build()); } } - // Return the finalized field index holes. - return fieldMapBuilder.build(); - } - - private Multimap> getFieldIndexHoles(Map>> frequencyMap, - Map>> indexMap) { - // New tracking variables. - String prevDataType = null; - Pair prevFrequencyDateRange = null; - Date holeStartDate = null; - Multimap> fieldIndexHoles = HashMultimap.create(); - Calendar calendar = Calendar.getInstance(); - - // Compare the date ranges for each datatype to identify any and all field index holes. Evaluate the date ranges for each datatype. - for (String datatype : frequencyMap.keySet()) { - // If holeStartDate is not null, we have a hole left over from the previous datatype combination. The index hole spans from the hole - // start date to the end of the last frequency date range. - if (holeStartDate != null) { - fieldIndexHoles.put(prevDataType, Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); - holeStartDate = null; - } + /** + * Return a set of all index hole date ranges found for the given maps of frequency and index entries. 
+ * + * @param frequencyMap + * the frequency entries + * @param indexMap + * the index entries + * @return a set of index holes, possibly empty, but never null + */ + private Set> getIndexHoles(SortedMap frequencyMap, SortedMap indexMap) { + Set> indexHoles = new HashSet<>(); + Date holeStartDate = null; + Date prevDate = null; - // At least one corresponding index row was seen. Compare the date ranges to identify any index holes. - if (indexMap.containsKey(datatype)) { - SortedSet> frequencyDates = frequencyMap.get(datatype); - Iterator> indexDatesIterator = indexMap.get(datatype).iterator(); - Pair prevIndexDateRange = null; - boolean comparePrevIndexDateRange = false; - // Evaluate each date range we saw for frequency rows for the current fieldName-datatype. - for (Pair frequencyDateRange : frequencyDates) { - Date frequencyStartDate = frequencyDateRange.getLeft(); - Date frequencyEndDate = frequencyDateRange.getRight(); - - // If it's been flagged that we need to compare the previous index date range to the current frequency date range, do so. This is done when - // we potentially have an index hole that spans over the end of the previous frequency date range and the start of the next frequency date - // range. - if (comparePrevIndexDateRange) { - Date indexStartDate = prevIndexDateRange.getLeft(); - Date indexEndDate = prevIndexDateRange.getRight(); - - // If holeStartDate is not null, we have an index hole left over from the previous frequency date range. The index hole spans from the - // hole start date to the end of the last frequency date range. + for (Date date : frequencyMap.keySet()) { + // There is a corresponding index entry for the current date. + if (indexMap.containsKey(date)) { + // The count for the current index entry meets the minimum threshold. + if (meetsMinThreshold(frequencyMap.get(date), indexMap.get(date))) { + // The previous entry was part of an index hole. Capture the index hole range. 
if (holeStartDate != null) { - fieldIndexHoles.put(datatype, Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); + indexHoles.add(Pair.of(holeStartDate, prevDate)); holeStartDate = null; } - - // The index start date is equal to the frequency start date. Check for a hole. - if (indexStartDate.equals(frequencyStartDate)) { - if (!indexEndDate.equals(frequencyEndDate)) { - // There is an index hole starting the day after the index end date. We must evaluate the next index date range to determine the - // end date of the index hole. - calendar.setTime(indexEndDate); - calendar.add(Calendar.DATE, 1); - holeStartDate = calendar.getTime(); - } - // Otherwise there is no index hole here. - } else { - // The index start date is after the frequency start date. Check if we have a hole that partially covers the frequency date range, - // or all of it. - if (indexStartDate.before(frequencyEndDate)) { - // There is an index hole starting on the frequency start date, and ending the day before the index start date. - calendar.setTime(indexStartDate); - calendar.add(Calendar.DATE, -1); - fieldIndexHoles.put(datatype, Pair.of(frequencyStartDate, calendar.getTime())); - - if (indexEndDate.before(frequencyEndDate)) { - // There is an index hole starting the day after the index end date. We must evaluate the next index date range to determine - // the end date of the index hole. - calendar.setTime(indexEndDate); - calendar.add(Calendar.DATE, 1); - holeStartDate = calendar.getTime(); - } - } else { - // The entire frequency date range is an index hole. Add it as such, and continue to the next frequency date range. We want to - // compare the current index date range to the next frequency date range as well. - fieldIndexHoles.put(datatype, frequencyDateRange); - continue; - } + } else { + // The count for the current index entry does not meet the minimum threshold, and thus this entry is part of an index hole. 
Mark the + // start + // of an index hole date range if we have not already found one. + if (holeStartDate == null) { + holeStartDate = date; } - comparePrevIndexDateRange = false; } - - // Evaluate each index date range against the current frequency date range. If we see an index date range that begins after the current - // frequency date range, we will skip to the next frequency date range. - while (indexDatesIterator.hasNext()) { - Pair indexDateRange = indexDatesIterator.next(); - Date indexStartDate = indexDateRange.getLeft(); - Date indexEndDate = indexDateRange.getRight(); - - if (indexStartDate.equals(frequencyStartDate)) { - if (indexEndDate.equals(frequencyEndDate)) { - // The current index date range is equal to the current frequency date rang, and there is no index hole for the current - // frequency date range. Break out of the loop and continue to the next frequency date range. - prevIndexDateRange = indexDateRange; - break; - } else { - // There is an index hole starting the day after the index end date. Mark the start date, and continue to the next index date - // range to determine the end date. - calendar.setTime(indexEndDate); - calendar.add(Calendar.DATE, 1); - holeStartDate = calendar.getTime(); - } - } else if (indexStartDate.before(frequencyEndDate)) { - calendar.setTime(indexStartDate); - calendar.add(Calendar.DATE, -1); - if (holeStartDate != null) { - // If holeStartDate is not null, we've previously identified the start of an index hole that is not the start of the frequency - // date range. There is an index hole from holeStartDate to the day before the index start date. - fieldIndexHoles.put(datatype, Pair.of(holeStartDate, calendar.getTime())); - holeStartDate = null; - } else { - // There is an index hole from the frequency start date to the day before the index start date. 
- fieldIndexHoles.put(datatype, Pair.of(frequencyStartDate, calendar.getTime())); - } - - // It's possible for the current index date range to end before the current frequency date range. If so, this indicates a new index - // hole. - if (indexEndDate.before(frequencyEndDate)) { - // There is an index hole starting the day after the index end date. We need to evaluate the next index date range to determine - // the end of the index hole. Mark the start of this new index hole. - calendar.setTime(indexEndDate); - calendar.add(Calendar.DATE, 1); - holeStartDate = calendar.getTime(); - } - } else { - // The start of the current index date range occurs after the current frequency date range. There is a hole in the current frequency - // date range. - if (holeStartDate == null) { - // The entire current frequency date range is an index hole. Add it as such and break out to continue to the next frequency - // date range. - fieldIndexHoles.put(datatype, frequencyDateRange); - break; - } else { - // There is an index hole from the recorded hole start date to the end of the frequency date range. Add it as such and break - // out to continue to the next frequency date range. - fieldIndexHoles.put(datatype, Pair.of(holeStartDate, frequencyEndDate)); - holeStartDate = null; - // The current index date range is entirely after the current frequency date range. As such, we need to compare the current - // index date range to the next frequency date range. - comparePrevIndexDateRange = true; - } - } - // Update the prev index date range. - prevIndexDateRange = indexDateRange; + } else { + // There is no corresponding index entry for the current date. This is the start of an index hole if we have not previously found one. + if (holeStartDate == null) { + holeStartDate = date; } - // Update the prev frequency date range. - prevFrequencyDateRange = frequencyDateRange; } - } else { - // No corresponding index rows were seen for any of the frequency rows. 
Each date range represents an index hole. - fieldIndexHoles.putAll(datatype, frequencyMap.get(datatype)); + // Track the previous date. + prevDate = date; + } + + // If we have finished looking at all dates, and we have a trailing index hole, capture the last index hole date range. + if (holeStartDate != null) { + indexHoles.add(Pair.of(holeStartDate, prevDate)); } - // Update the prev datatype. - prevDataType = datatype; + + return indexHoles; } - // If we have a non-null hole start date after processing all the date ranges, we have an index hole that ends at the last frequency date range seen - // for the last fieldName-datatype combination. - if (holeStartDate != null) { - fieldIndexHoles.put(prevDataType, Pair.of(holeStartDate, prevFrequencyDateRange.getRight())); + /** + * Return whether the given index count meets the minimum threshold for the given frequency count. + * + * @param frequencyCount + * the frequency count + * @param indexCount + * the index count + * @return true if the threshold is met, or false otherwise + */ + private boolean meetsMinThreshold(Long frequencyCount, Long indexCount) { + if (indexCount >= frequencyCount) { + return true; + } + + double percentage = indexCount.doubleValue() / frequencyCount; + return percentage >= minThreshold; } - return fieldIndexHoles; + /** + * Return an immutable version of {@link #fieldIndexHoles}, with all empty collections removed. + * + * @return an immutable map. 
+ */ + private Map> getImmutableFieldIndexHoles() { + ImmutableMap.Builder> fieldMapBuilder = new ImmutableMap.Builder<>(); + + for (String fieldName : this.fieldIndexHoles.keySet()) { + Multimap> datatypeMap = this.fieldIndexHoles.get(fieldName); + if (!datatypeMap.isEmpty()) { + ImmutableMap.Builder datatypeMapBuilder = new ImmutableMap.Builder<>(); + for (String datatype : datatypeMap.keySet()) { + FieldIndexHole fieldIndexHole = new FieldIndexHole(fieldName, datatype, datatypeMap.get(datatype)); + datatypeMapBuilder.put(datatype, fieldIndexHole); + } + fieldMapBuilder.put(fieldName, datatypeMapBuilder.build()); + } + } + + return fieldMapBuilder.build(); + } } private static String getKey(String instanceID, String metadataTableName) { diff --git a/src/main/java/datawave/query/util/MetadataHelper.java b/src/main/java/datawave/query/util/MetadataHelper.java index 2b5e1d10..f7239af1 100644 --- a/src/main/java/datawave/query/util/MetadataHelper.java +++ b/src/main/java/datawave/query/util/MetadataHelper.java @@ -1418,20 +1418,36 @@ protected Date getEarliestOccurrenceOfFieldWithType(String fieldName, final Stri /** * Return the field index holes calculated between all "i" and "f" entries. The map consists of field names to datatypes to field index holes. 
- * + * @param fields + * the fields to fetch field index holes for, an empty set will result in all fields being fetched + * @param datatypes + * the datatypes to fetch field index holes for, an empty set will result in all datatypes being fetched + * @param minThreshold + * the minimum percentage threshold required for an index row to be considered NOT a hole on a particular date, expected to be a value between + * 0.0 (inclusive) and 1.0 (inclusive) * @return the field index holes */ - public Map> getFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { - return allFieldMetadataHelper.getFieldIndexHoles(); + public Map> getFieldIndexHoles(Set fields, Set datatypes, double minThreshold) + throws TableNotFoundException, IOException { + return allFieldMetadataHelper.getFieldIndexHoles(fields, datatypes, minThreshold); } /** * Return the field index holes calculated between all "ri" and "f" entries. The map consists of field names to datatypes to field index holes. 
- * + * + * @param fields + * the fields to fetch field index holes for, an empty set will result in all fields being fetched + * @param datatypes + * the datatypes to fetch field index holes for, an empty set will result in all datatypes being fetched + * @param minThreshold + * the minimum percentage threshold required for an index row to be considered NOT a hole on a particular date, expected to be a value between + * 0.0 (inclusive) and 1.0 (inclusive) * @return the field index holes */ - public Map> getReversedFieldIndexHoles() throws TableNotFoundException, CharacterCodingException { - return allFieldMetadataHelper.getReversedFieldIndexHoles(); + public Map> getReversedFieldIndexHoles(Set fields, Set datatypes, double minThreshold) + throws TableNotFoundException, IOException { + return allFieldMetadataHelper.getReversedFieldIndexHoles(fields, datatypes, minThreshold); } /** diff --git a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java index ebf2c561..0e0e3feb 100644 --- a/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java +++ b/src/test/java/datawave/query/util/AllFieldMetadataHelperTest.java @@ -1,14 +1,15 @@ package datawave.query.util; import java.io.File; +import java.io.IOException; import java.net.URISyntaxException; -import java.nio.charset.CharacterCodingException; import java.util.ArrayList; import java.util.Calendar; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; @@ -50,7 +51,6 @@ class AllFieldMetadataHelperTest { private static final String TABLE_METADATA = "metadata"; private static final String[] AUTHS = {"FOO"}; private static final String NULL_BYTE = "\0"; - private static final Value NULL_VALUE = new Value(new byte[0]); private AccumuloClient accumuloClient; private AllFieldMetadataHelper 
helper; @@ -100,31 +100,43 @@ private void writeMutations(Collection mutations) { } /** - * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles()}. + * Tests for {@link AllFieldMetadataHelper#getFieldIndexHoles(Set, Set, double)} and + * {@link AllFieldMetadataHelper#getReversedFieldIndexHoles(Set, Set, double)}. */ @Nested public class FieldIndexHoleTests { - private final Supplier>> INDEX_FUNCTION = () -> { + private Set fields = new HashSet<>(); + private Set datatypes = new HashSet<>(); + private double minimumThreshold = 1.0d; + + protected final Supplier>> INDEX_FUNCTION = () -> { try { - return helper.getFieldIndexHoles(); - } catch (TableNotFoundException | CharacterCodingException e) { + return helper.getFieldIndexHoles(fields, datatypes, minimumThreshold); + } catch (TableNotFoundException | IOException e) { throw new RuntimeException(e); } }; - private final Supplier>> REVERSED_INDEX_FUNCTION = () -> { + protected final Supplier>> REVERSED_INDEX_FUNCTION = () -> { try { - return helper.getReversedFieldIndexHoles(); - } catch (TableNotFoundException | CharacterCodingException e) { + return helper.getReversedFieldIndexHoles(fields, datatypes, minimumThreshold); + } catch (TableNotFoundException | IOException e) { throw new RuntimeException(e); } }; - private Supplier>> getIndexHoleFunction(String cf) { + protected Supplier>> getIndexHoleFunction(String cf) { return cf.equals("i") ? INDEX_FUNCTION : REVERSED_INDEX_FUNCTION; } + @AfterEach + void tearDown() { + fields.clear(); + datatypes.clear(); + givenMinimumThreshold(1.0d); + } + /** * Test against data that has no field index holes. */ @@ -132,19 +144,19 @@ private Supplier>> getIndexHoleFunction(St @ValueSource(strings = {"i", "ri"}) void testNoFieldIndexHoles(String cf) { // Create a series of frequency rows over date ranges, each with a matching index row for each date. 
- MutationCreator mutationCreator = new MutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200120"); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200120"); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200120"); - mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200120"); - mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200120"); - mutationCreator.addFrequencyMutations("EVENT_DATE", "csv", "20200101", "20200120"); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "csv", "20200101", "20200120"); - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200101", "20200120"); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200101", "20200120"); - mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200101", "20200120"); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200101", "20200120"); + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200120", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200120", 1L); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200120", 1L); + mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200120", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200120", 1L); + mutationCreator.addFrequencyMutations("EVENT_DATE", "csv", "20200101", "20200120", 1L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "csv", "20200101", "20200120", 1L); + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200101", "20200120", 1L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", 
"20200101", "20200120", 1L); + mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200101", "20200120", 1L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200101", "20200120", 1L); writeMutations(mutationCreator.getMutations()); // Verify that no index holes were found. @@ -153,15 +165,15 @@ void testNoFieldIndexHoles(String cf) { } /** - * Test against data that has field index holes for an entire fieldName-datatype combination. + * Test against data that has field index holes for an entire fieldName-datatype combination based on date gaps. */ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) - void testFieldIndexHoleForEntireFrequencyDateRange(String cf) { - MutationCreator mutationCreator = new MutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); // Do not create matching index rows for these. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105"); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); + void testFieldIndexHoleForEntireFrequencyDateRange_dateGaps(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); writeMutations(mutationCreator.getMutations()); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -170,18 +182,38 @@ void testFieldIndexHoleForEntireFrequencyDateRange(String cf) { // @formatter:off Assertions.assertEquals(expected, fieldIndexHoles); } + + /** + * Test against data that has field index holes for an entire fieldName-datatype combination based on the threshold requirement. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEntireFrequencyDateRange_threshold(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200105", 1L); // Make the index counts a value that will not meet the threshold. + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } /** - * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination. + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination based on date gaps. 
*/ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) - void testFieldIndexHoleForStartOfFrequencyDateRange(String cf) { - MutationCreator mutationCreator = new MutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105"); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105"); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); + void testFieldIndexHoleForStartOfFrequencyDateRange_dateGaps(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 1L); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); writeMutations(mutationCreator.getMutations()); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -190,18 +222,40 @@ void testFieldIndexHoleForStartOfFrequencyDateRange(String cf) { // @formatter:off Assertions.assertEquals(expected, fieldIndexHoles); } + + /** + * Test against data that has a field index hole at the start of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForStartOfFrequencyDateRange_threshold(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 5L); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } /** - * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination. + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on date gaps. */ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) - void testFieldIndexHoleForEndOfFrequencyDateRange(String cf) { - MutationCreator mutationCreator = new MutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102"); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105"); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); + void testFieldIndexHoleForEndOfFrequencyDateRange_dateGaps(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); writeMutations(mutationCreator.getMutations()); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -210,19 +264,41 @@ void 
testFieldIndexHoleForEndOfFrequencyDateRange(String cf) { // @formatter:off Assertions.assertEquals(expected, fieldIndexHoles); } + + /** + * Test against data that has a field index hole at the end of a frequency date range for a given fieldName-dataType combination based on the + * threshold requirement. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForEndOfFrequencyDateRange_thresholds(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200105", 1L); // Will not meet threshold. + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } /** - * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination. + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on date gaps. 
*/ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) - void testFieldIndexHoleForMiddleOfFrequencyDateRange(String cf) { - MutationCreator mutationCreator = new MutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110"); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105"); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105"); + void testFieldIndexHoleForMiddleOfFrequencyDateRange_dateGaps(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 1L); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 1L); writeMutations(mutationCreator.getMutations()); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -231,18 +307,64 @@ void testFieldIndexHoleForMiddleOfFrequencyDateRange(String cf) { // @formatter:off Assertions.assertEquals(expected, fieldIndexHoles); } + + /** + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on the + * threshold. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleForMiddleOfFrequencyDateRange_threshold(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 5L); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); + writeMutations(mutationCreator.getMutations()); + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + /** - * Test against data that has multiple field index holes for a given fieldName-datatype combination. + * Test against data that has a field index hole in the middle of a frequency date range for a given fieldName-datatype combination based on both date + * gaps and the threshold. 
*/ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) - void testMultipleFieldIndexHolesInFrequencyDateRange(String cf) { - MutationCreator mutationCreator = new MutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118"); + void testFieldIndexHoleForMiddleOfFrequencyDateRange_mixed(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200110", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200106", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200110", 5L); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200101", "20200105", 5L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:on + Map> expected = createFieldIndexHoleMap(createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200106"))); + // @formatter:off + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldIndexHolesInFrequencyDateRange_dateGap(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118", 1L); writeMutations(mutationCreator.getMutations()); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -257,16 +379,45 @@ void testMultipleFieldIndexHolesInFrequencyDateRange(String cf) { } /** - * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination. + * Test against data that has multiple field index holes for a given fieldName-datatype combination based on the threshold. */ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) - void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo(String cf) { - MutationCreator mutationCreator = new MutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102"); - mutationCreator.addFrequencyMutations("ZETA", "csv", "20200101", "20200105"); - mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105"); + void testMultipleFieldIndexHolesInFrequencyDateRange_threshold(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200120", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200106", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200107", "20200109", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200113", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200114", "20200116", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200117", "20200118", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200119", "20200120", 1L); // Will not meet threshold. + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200103"), + dateRange("20200107", "20200109"), + dateRange("20200114", "20200116"), + dateRange("20200119", "20200120"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * date gaps. 
+ */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_dateGap(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); + mutationCreator.addFrequencyMutations("ZETA", "csv", "20200101", "20200105", 1L); + mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105", 1L); writeMutations(mutationCreator.getMutations()); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -278,37 +429,83 @@ void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo(String cf) { } /** - * Test against data where the expected index hole spans across multiple frequency ranges. + * Test against data where the expected index hole occurs for the end of a frequency range right before a new fieldName-datatype combination based on + * the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleAtEndOfFrequencyDateRangeForNonLastCombo_threshold(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200105", 1L); // Will not meet threshold. 
+ mutationCreator.addFrequencyMutations("ZETA", "csv", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200101", "20200105", 5L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200105"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_dateGaps(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200110", "20200115", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115", 1L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where the expected index hole spans across multiple frequency ranges based on the threshold. 
*/ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) - void testFieldIndexHoleSpanningMultipleFrequencyDateRanges(String cf) { - MutationCreator mutationCreator = new MutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200110", "20200115"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115"); + void testFieldIndexHoleSpanningMultipleFrequencyDateRanges_threshold(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200103", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200105", 1L); // Will not meet threshold. + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200110", "20200115", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200110", "20200112", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200113", "20200115", 5L); writeMutations(mutationCreator.getMutations()); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); // @formatter:off Map> expected = createFieldIndexHoleMap( - createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200105"), dateRange("20200110", "20200112"))); + createFieldIndexHole("NAME", "wiki", dateRange("20200104", "20200112"))); // @formatter:on Assertions.assertEquals(expected, fieldIndexHoles); } /** - * Test against data where everything is an index hole. + * Test against data where everything is an index hole based on date gaps. 
*/ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) - void testAllDatesAreIndexHoles(String cf) { - MutationCreator mutationCreator = new MutationCreator(); - mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115"); - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125"); - mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328"); + void testAllDatesAreIndexHoles_dateGaps(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 1L); + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 1L); + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 1L); writeMutations(mutationCreator.getMutations()); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -323,30 +520,58 @@ void testAllDatesAreIndexHoles(String cf) { } /** - * Test against data where we have a number of index holes that span just a day. + * Test against data where everything is an index hole based on the threshold. */ @ParameterizedTest @ValueSource(strings = {"i", "ri"}) - void testSingularDayIndexHoles(String cf) { - MutationCreator mutationCreator = new MutationCreator(); + void testAllDatesAreIndexHoles_threshold(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200105", 1L); // Will not meet threshold. 
+ mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200115", 1L); // Will not meet threshold. + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200125", 1L); // Will not meet threshold. + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200328", 1L); // Will not meet threshold. + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200101", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200115")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200120", "20200125")), + createFieldIndexHole("URI", "maze", dateRange("20200216", "20200328"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on date gaps. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_dateGaps(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); // Index holes for NAME-wiki on 20200103 and 20200105. 
- mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102"); - mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104"); + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 1L); // Index holes for NAME-csv on 20200110 and 20200113. - mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115"); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112"); - mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115"); + mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 1L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 1L); // Index hole for EVENT_DATE-wiki on 20200122. - mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125"); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121"); - mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125"); + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 1L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 1L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 1L); // Index holes for URI-maze on 20200221, 20200303, and 20200316. 
- mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328"); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220"); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302"); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315"); - mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328"); + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 1L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 1L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 1L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 1L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 1L); writeMutations(mutationCreator.getMutations()); Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); @@ -360,49 +585,427 @@ void testSingularDayIndexHoles(String cf) { // @formatter:on Assertions.assertEquals(expected, fieldIndexHoles); } - } - - private Map> createFieldIndexHoleMap(FieldIndexHole... holes) { - Map> fieldIndexHoles = new HashMap<>(); - for (FieldIndexHole hole : holes) { - Map datatypeMap = fieldIndexHoles.computeIfAbsent(hole.getFieldName(), k -> new HashMap<>()); - datatypeMap.put(hole.getDatatype(), hole); + + /** + * Test against data where we have a number of index holes that span just a day based on the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testSingularDayIndexHoles_threshold(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + // Index holes for NAME-csv on 20200110 and 20200113. + mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200113", "20200113", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test against data where we have a number of index holes that span just a day based on both dates and the threshold. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMixedDateGapsAndThresholdIndexHoles(String cf) { + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + // Index holes for NAME-wiki on 20200103 and 20200105. + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + // Index holes for NAME-csv on 20200110 and 20200113. + mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying a minimum percentage threshold other than the default of 1.0. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMinimumThresholdPercentageBelow100(String cf) { + givenMinimumThreshold(0.75); // Index count must meet 75% of frequency count to not be considered field index hole. + + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + // Index holes for NAME-wiki on 20200103 and 20200105. + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 100L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 75L); // Meets 75% threshold. + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 100L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 74L); // Will not meet threshold. + // Index holes for NAME-csv on 20200110 and 20200113. + mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 100L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 74L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 75L); // Meets 75% threshold. + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 100L); + // Index hole for EVENT_DATE-wiki on 20200122. + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 100L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 98L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 74L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 75L); // Meets 75% threshold. + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 100L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 100L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 74L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 90L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 75L); // Meets 75% threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 74L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 99L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying one field to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testOneFieldSpecified(String cf) { + // Retrieve field index holes for field NAME. + givenFields("NAME"); + + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + // Index holes for NAME-wiki on 20200103 and 20200105. + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + // Index holes for NAME-csv on 20200110 and 20200113. + mutationCreator.addFrequencyMutations("NAME", "csv", "20200110", "20200115", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200110", "20200110", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200111", "20200112", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. 
+ mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("NAME", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying multiple fields to filter on. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testMultipleFieldsSpecified(String cf) { + // Retrieve field index holes for fields URI and EVENT_DATE. + givenFields("URI", "EVENT_DATE"); + + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + // Index holes for ALPHA-csv on 20200110 and 20200113. + mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + // Index hole for ZETA-wiki on 20200122. + mutationCreator.addFrequencyMutations("ZETA", "wiki", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "ZETA", "wiki", "20200123", "20200125", 5L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("URI", "maze", dateRange("20200221", "20200221"), dateRange("20200303", "20200303"), + dateRange("20200316", "20200316"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testDatatypesSpecified(String cf) { + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + // Index holes for NAME-wiki on 20200103 and 20200105. 
+ mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. + // Index holes for NAME-maze on 20200103 and 20200105. + mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200102", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200103", "20200103", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200104", "20200104", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200105", "20200105", 1L); // Will not meet threshold. + // Index holes for ALPHA-csv on 20200110 and 20200113. + mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + // Index hole for EVENT_DATE-maze on 20200122. + mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200122", "20200122", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + // Index hole for ZETA-csv on 20200122. + mutationCreator.addFrequencyMutations("ZETA", "csv", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200122", "20200122", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200123", "20200125", 5L); + // Index hole for ZETA-imdb on 20200122. 
+ mutationCreator.addFrequencyMutations("ZETA", "imdb", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200122", "20200122", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200123", "20200125", 5L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ALPHA", "csv", dateRange("20200110", "20200110"), dateRange("20200113", "20200113")), + createFieldIndexHole("EVENT_DATE", "wiki", dateRange("20200122", "20200122")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + /** + * Test specifying fields and datatypes. + */ + @ParameterizedTest + @ValueSource(strings = {"i", "ri"}) + void testFieldsAndDatatypesSpecified(String cf) { + // Retrieve field index holes for fields NAME and ZETA. + givenFields("NAME", "ZETA"); + // Retrieve field index holes for datatypes wiki and csv. + givenDatatypes("wiki", "csv"); + + FieldIndexHoleMutationCreator mutationCreator = new FieldIndexHoleMutationCreator(); + // Index holes for NAME-wiki on 20200103 and 20200105. + mutationCreator.addFrequencyMutations("NAME", "wiki", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200101", "20200102", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200103", "20200103", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200104", "20200104", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "wiki", "20200105", "20200105", 1L); // Will not meet threshold. 
+ // Index holes for NAME-maze on 20200103 and 20200105. + mutationCreator.addFrequencyMutations("NAME", "maze", "20200101", "20200105", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200101", "20200102", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200103", "20200103", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200104", "20200104", 5L); + mutationCreator.addIndexMutations(cf, "NAME", "maze", "20200105", "20200105", 1L); // Will not meet threshold. + // Index holes for ALPHA-csv on 20200110 and 20200113. + mutationCreator.addFrequencyMutations("ALPHA", "csv", "20200110", "20200115", 5L); + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200110", "20200110", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200111", "20200112", 5L); + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200113", "20200113", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "ALPHA", "csv", "20200114", "20200115", 5L); + // Index hole for EVENT_DATE-wiki on 20200122. + mutationCreator.addFrequencyMutations("EVENT_DATE", "wiki", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200122", "20200122", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "wiki", "20200123", "20200125", 5L); + // Index hole for EVENT_DATE-maze on 20200122. + mutationCreator.addFrequencyMutations("EVENT_DATE", "maze", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200122", "20200122", 1L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "EVENT_DATE", "maze", "20200123", "20200125", 5L); + // Index holes for URI-maze on 20200221, 20200303, and 20200316. + mutationCreator.addFrequencyMutations("URI", "maze", "20200216", "20200328", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200216", "20200220", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200221", "20200221", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200222", "20200302", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200303", "20200303", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200304", "20200315", 5L); + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200316", "20200316", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "URI", "maze", "20200317", "20200328", 5L); + // Index hole for ZETA-csv on 20200122. + mutationCreator.addFrequencyMutations("ZETA", "csv", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200122", "20200122", 1L); // Will not meet threshold. + mutationCreator.addIndexMutations(cf, "ZETA", "csv", "20200123", "20200125", 5L); + // Index hole for ZETA-imdb on 20200122. + mutationCreator.addFrequencyMutations("ZETA", "imdb", "20200120", "20200125", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200120", "20200121", 5L); + mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200122", "20200122", 1L); // Will not meet threshold. 
+ mutationCreator.addIndexMutations(cf, "ZETA", "imdb", "20200123", "20200125", 5L); + writeMutations(mutationCreator.getMutations()); + + Map> fieldIndexHoles = getIndexHoleFunction(cf).get(); + // @formatter:off + Map> expected = createFieldIndexHoleMap( + createFieldIndexHole("NAME", "wiki", dateRange("20200103", "20200103"), dateRange("20200105", "20200105")), + createFieldIndexHole("ZETA", "csv", dateRange("20200122", "20200122"))); + // @formatter:on + Assertions.assertEquals(expected, fieldIndexHoles); + } + + private void givenFields(String... fields) { + this.fields = Sets.newHashSet(fields); + } + + private void givenDatatypes(String... datatypes) { + this.datatypes = Sets.newHashSet(datatypes); + } + + private void givenMinimumThreshold(double minimumThreshold) { + this.minimumThreshold = minimumThreshold; + } + + protected Map> createFieldIndexHoleMap(FieldIndexHole... holes) { + Map> fieldIndexHoles = new HashMap<>(); + for (FieldIndexHole hole : holes) { + Map datatypeMap = fieldIndexHoles.computeIfAbsent(hole.getFieldName(), k -> new HashMap<>()); + datatypeMap.put(hole.getDatatype(), hole); + } + return fieldIndexHoles; + } + + @SafeVarargs + protected final FieldIndexHole createFieldIndexHole(String field, String datatype, Pair... dateRanges) { + return new FieldIndexHole(field, datatype, Sets.newHashSet(dateRanges)); + } + + protected Pair dateRange(String start, String end) { + return Pair.of(DateHelper.parse(start), DateHelper.parse(end)); } - return fieldIndexHoles; - } - - @SafeVarargs - private FieldIndexHole createFieldIndexHole(String field, String datatype, Pair... dateRanges) { - return new FieldIndexHole(field, datatype, Sets.newHashSet(dateRanges)); - } - - private Pair dateRange(String start, String end) { - return Pair.of(DateHelper.parse(start), DateHelper.parse(end)); } /** - * Helper class for creating mutations in bulk. + * Helper class for creating mutations in bulk for field index hole tests. 
*/ - private static class MutationCreator { + private static class FieldIndexHoleMutationCreator { private final List mutations = new ArrayList<>(); - private void addFrequencyMutations(String fieldName, String datatype, String startDate, String endDate) { + private void addFrequencyMutations(String fieldName, String datatype, String startDate, String endDate, long count) { List dates = getDatesInRange(startDate, endDate); - dates.forEach(date -> addFrequencyMutation(fieldName, datatype, date)); + dates.forEach(date -> addMutation(fieldName, "f", datatype, date, count)); } - private void addFrequencyMutation(String fieldName, String datatype, String date) { - addMutation(fieldName, "f", datatype + NULL_BYTE + date, new Value(SummingCombiner.VAR_LEN_ENCODER.encode(1L))); - } - - private void addIndexMutations(String cf, String fieldName, String datatype, String startDate, String endDate) { + private void addIndexMutations(String cf, String fieldName, String datatype, String startDate, String endDate, long count) { List dates = getDatesInRange(startDate, endDate); - dates.forEach(date -> addIndexMutation(cf, fieldName, datatype, date)); - } - - private void addIndexMutation(String cf, String fieldName, String datatype, String date) { - addMutation(fieldName, cf, datatype + NULL_BYTE + date, NULL_VALUE); + dates.forEach(date -> addMutation(fieldName, cf, datatype, date, count)); } private List getDatesInRange(String startDateStr, String endDateStr) { @@ -427,9 +1030,9 @@ private List getDatesInRange(String startDateStr, String endDateStr) { return dates; } - private void addMutation(String row, String columnFamily, String columnQualifier, Value value) { + private void addMutation(String row, String columnFamily, String datatype, String date, long count) { Mutation mutation = new Mutation(row); - mutation.put(columnFamily, columnQualifier, value); + mutation.put(columnFamily, datatype + NULL_BYTE + date, new Value(SummingCombiner.VAR_LEN_ENCODER.encode(count))); 
mutations.add(mutation); } @@ -437,4 +1040,5 @@ private List getMutations() { return mutations; } } + }