/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.flatten;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.UUID;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
import java.util.stream.Stream;

import javax.inject.Inject;

import org.diqube.context.AutoInstatiate;
import org.diqube.data.column.ColumnPage;
import org.diqube.data.column.ColumnPageFactory;
import org.diqube.data.column.ColumnShardFactory;
import org.diqube.data.column.ColumnType;
import org.diqube.data.column.StandardColumnShard;
import org.diqube.data.dictionary.Dictionary;
import org.diqube.data.flatten.FlattenDataFactory;
import org.diqube.data.flatten.FlattenedTable;
import org.diqube.data.table.Table;
import org.diqube.data.table.TableFactory;
import org.diqube.data.table.TableShard;
import org.diqube.data.types.dbl.dict.ConstantDoubleDictionary;
import org.diqube.data.types.dbl.dict.DoubleDictionary;
import org.diqube.data.types.lng.dict.ConstantLongDictionary;
import org.diqube.data.types.lng.dict.LongDictionary;
import org.diqube.data.types.str.dict.ConstantStringDictionary;
import org.diqube.data.types.str.dict.StringDictionary;
import org.diqube.executionenv.querystats.QueryableLongColumnShardFacade;
import org.diqube.executionenv.util.ColumnPatternUtil;
import org.diqube.executionenv.util.ColumnPatternUtil.ColumnPatternContainer;
import org.diqube.executionenv.util.ColumnPatternUtil.LengthColumnMissingException;
import org.diqube.executionenv.util.ColumnPatternUtil.PatternException;
import org.diqube.loader.LoaderColumnInfo;
import org.diqube.loader.columnshard.ColumnPageBuilder;
import org.diqube.loader.columnshard.ColumnShardBuilder;
import org.diqube.loader.compression.CompressedDoubleDictionaryBuilder;
import org.diqube.loader.compression.CompressedLongDictionaryBuilder;
import org.diqube.loader.compression.CompressedStringDictionaryBuilder;
import org.diqube.name.FlattenedTableNameUtil;
import org.diqube.name.RepeatedColumnNameGenerator;
import org.diqube.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.PeekingIterator;
import com.google.common.collect.Sets;

/**
 * Flattens a {@link Table} on a specific (repeated) field, i.e.
 * for each entry in the repeated field denoted by the flatten-by field, the resulting table will contain a separate
 * row.
 *
 * <p>
 * The resulting table will have a different number of rows, as for each index of the repeated field of each row, a
 * new row will be provided.
 *
 * <p>
 * Example input table with two rows and a nested array:
 *
 * <pre>
 * { a : [ { b : 1 },
 *         { b : 2 } ],
 *   c : 9 },
 * { a : [ { b : 3 },
 *         { b : 4 } ],
 *   c : 10}
 * </pre>
 *
 * When flattening this over "a[*]", each element of the a[.] arrays becomes a separate row (= table with 4 rows):
 *
 * <pre>
 * { a.b : 1, c : 9 },
 * { a.b : 2, c : 9 },
 * { a.b : 3, c : 10 },
 * { a.b : 4, c : 10 }
 * </pre>
 *
 * <p>
 * Note that values are not validated in any way. That means that if a specific entry in the array did not have all
 * fields defined, those non-defined fields will be non-defined in the resulting rows, too. TODO #14: Support optional
 * columns.
 *
 * @author Bastian Gloeckle
 */
@AutoInstatiate
public class Flattener {
  private static final Logger logger = LoggerFactory.getLogger(Flattener.class);

  @Inject
  private FlattenDataFactory factory;

  @Inject
  private RepeatedColumnNameGenerator repeatedColNameGen;

  @Inject
  private FlattenedTableNameUtil flattenedTableNameGen;

  @Inject
  private ColumnPatternUtil colPatternUtil;

  @Inject
  private ColumnPageFactory columnPageFactory;

  @Inject
  private ColumnShardFactory columnShardFactory;

  @Inject
  private TableFactory tableFactory;

  /**
   * Flattens the given table by the given flatten-by field, returning a preliminary flattened table (see below).
   *
   * <p>
   * For details, see the class doc.
   *
   * <p>
   * For each input TableShard, one new {@link TableShard} will be created. Note that each flattened table shard will
   * have the same firstRowId as the corresponding input table shard - although the flattened shards will usually
   * contain more rows. This means that the rowIds in the returned flattened table will most probably overlap. <b>This
   * needs to be fixed after calling this method, otherwise the table is not usable!</b> Typically a table is spread
   * over multiple cluster nodes, which means that fixing the firstRowIds requires communicating with the other nodes;
   * therefore this util class does not take care of it.
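   *
   * <p>
   * A minimal usage sketch (illustrative only: how the bean and the input table are obtained is assumed here, as is
   * the repeated field "orders"; the firstRowIds of the resulting shards still need to be adjusted as described
   * above):
   *
   * <pre>
   * Flattener flattener = context.getBean(Flattener.class); // assuming a context that provides this bean
   * FlattenedTable flat = flattener.flattenTable(customersTable, null, "orders[*]", UUID.randomUUID());
   * // "flat" now contains one row per entry of the repeated field "orders" of each original row.
   * </pre>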
   *
   * @param inputTable
   *          The table that should be flattened. This cannot be an already flattened table.
   * @param inputTableShards
   *          The tableShards to work on. If this is not set (== <code>null</code>), the tableShards will be read from
   *          the inputTable.
   * @param flattenByField
   *          The field to flatten by, in the usual "all-array-notation" as defined in
   *          {@link RepeatedColumnNameGenerator} (e.g. a[*].c.b[*] to get a single row for each index in all the "b"
   *          arrays in a[*].c).
   * @param flattenId
   *          The ID of the flattening that should be used to generate the output table name.
   * @return The flattened table.
   * @throws IllegalArgumentException
   *           If a passed argument is invalid.
   * @throws PatternException
   *           If the flattenByField pattern was not recognized.
   * @throws LengthColumnMissingException
   *           If any required "length" col is missing.
   * @throws IllegalStateException
   *           If the table cannot be flattened for any reason.
   */
  public FlattenedTable flattenTable(Table inputTable, Collection<TableShard> inputTableShards, String flattenByField,
      UUID flattenId)
      throws IllegalArgumentException, IllegalStateException, PatternException, LengthColumnMissingException {
    if (inputTable instanceof FlattenedTable)
      throw new IllegalArgumentException("Cannot flatten an already flattened table.");

    if (!flattenByField.endsWith(repeatedColNameGen.allEntriesIdentifyingSubstr()))
      throw new IllegalArgumentException(
          "Flatten-By field does not end with '" + repeatedColNameGen.allEntriesIdentifyingSubstr() + "'");

    String resultTableName =
        flattenedTableNameGen.createFlattenedTableName(inputTable.getName(), flattenByField, flattenId);

    if (inputTableShards == null)
      inputTableShards = inputTable.getShards();

    List<TableShard> flattenedTableShards = new ArrayList<>();
    for (TableShard shard : inputTableShards)
      flattenedTableShards.add(flattenTableShard(resultTableName, shard, flattenByField));

    Set<Long> firstRowIdsOfInputShards =
        inputTableShards.stream().map(shard -> shard.getLowestRowId()).collect(Collectors.toSet());

    return factory.createFlattenedTable(resultTableName, flattenedTableShards, firstRowIdsOfInputShards);
  }

  /**
   * Flattens a single {@link TableShard}.
   *
   * <p>
   * This works as follows:
   *
   * <ol>
   * <li>Find all patterns the flatten-by-field pattern matches. These are the prefixes of the column names from which
   * a new row will be created.
   * <li>Also find the names of the length columns of these patterns.
   * <li>Produce a to-do list: What is the name of each output column and what input columns is that output column
   * created from?
   * <ul>
   * <li>Is the new column a "multiplicating col"? These are cols that are outside of the path of the repeated column
   * that is flattened over. Nevertheless, each input col contains a value for each row: a single row-value of the
   * input columns needs to be available for multiple rows of the output table.
   * <li>Remove previously found length-columns from the to-be-created col list (when flattening over a[*] we do not
   * want an a[length] column to appear in the output!).
   * </ul>
   * <li>Iterate over all rows of the input cols and identify for each row (1) how many output rows that row will
   * create (taking into account the length columns of the flatten-by field in that row) and (2) whether this row is
   * missing any child-fields (i.e. given an array a[*].c[*] and flattening over a[*], there are output cols a.c[0],
   * a.c[1], a.c[2], but it could be that a specific row does not contain a.c[2], because that row simply does not
   * have that many entries in the array).
   * <li>Build the new columns - each new column can either be "multiplicating" (see above), in which case the col
   * pages are repeated accordingly (and no-longer-repeated rows are removed from the repeated colPages), or it can be
   * "flattened" - in which case the col is a sub-field of the flattened one and we only need to remove rows that do
   * not contain any value.
   * </ol>
   *
   * We need to ensure that we do not mess up the row-ordering of the various output columns: each output column needs
   * to have the same number of rows and the rowIds need to match correctly. Therefore, when creating a column based
   * on input columns of which not all are actually materialized, we need to insert "constant" column pages into the
   * output which will then resolve to default values.
   *
   * <p>
   * Example:
   *
   * Source table:
   *
   * <pre>
   * {a:[ { b:[1] },
   *      { b:[2, 3] }]},
   * {a:[ { b:[4] },
   *      { b:[5, 6] }]}
   * </pre>
   *
   * In this example, there will be no column a[0].b[1] in the input (as all a[0]s have at most a single entry in .b).
   * If we now mapped new columns to col pages of old columns in the following way (flattened over a[*]; displayed is
   * the list of col pages that are consecutively accessed for a new column):
   *
   * <pre>
   * a.b[0] = [ all col pages of a[0].b[0] ]
   * a.b[1] = [ all col pages of a[0].b[1], all col pages of a[1].b[1] ]
   * a.b[length] = [ all col pages of a[0].b[length], all col pages of a[1].b[length] ]
   * </pre>
   *
   * ... we would mess up, as a.b[0] would have fewer rows than a.b[1] -> we need to add a "constant" colPage to
   * a.b[0] that resolves to a default value. Note that we will nevertheless probably never resolve those default
   * values (at least in this example), as the a.b[length] value will not allow us to iterate that far in the
   * corresponding rows.
   *
   * <p>
   * Note that the resulting TableShard will have the same first row ID as the input TableShard. If multiple
   * TableShards of the same table are flattened (which is usually the case), the row IDs might overlap after
   * flattening (since every TableShard keeps its original firstRowId, but each table shard contains more rows). The
   * rowIds need to be adjusted afterwards!
   */
  private TableShard flattenTableShard(String resultTableName, TableShard inputTableShard, String flattenByField)
      throws PatternException, LengthColumnMissingException, IllegalStateException {
    String[] flattenFieldSplit =
        flattenByField.split(Pattern.quote(repeatedColNameGen.allEntriesIdentifyingSubstr() + "."));

    List<String> repeatedFieldsAlongPath = new ArrayList<>();
    String prev = "";
    for (String splitPart : flattenFieldSplit) {
      if (!"".equals(prev))
        prev += ".";

      prev += splitPart;

      if (!splitPart.endsWith(repeatedColNameGen.allEntriesIdentifyingSubstr()))
        prev += repeatedColNameGen.allEntriesIdentifyingSubstr();

      repeatedFieldsAlongPath.add(prev);
    }

    // calculate the most specific patterns first - colPatternUtil will return its lists in the same ordering!
    repeatedFieldsAlongPath = Lists.reverse(repeatedFieldsAlongPath);

    Set<String> allInputLengthColsOfFlattenedFields = new HashSet<>();

    ColumnPatternContainer patterns = colPatternUtil.findColNamesForColNamePattern(lengthColName -> {
      allInputLengthColsOfFlattenedFields.add(lengthColName);
      return new QueryableLongColumnShardFacade(inputTableShard.getLongColumns().get(lengthColName));
    }, repeatedFieldsAlongPath);

    // transpose result of colPatternUtil: collect all the most specific patterns in a set, then the second-most
    // specific patterns etc.
    // Later we want to first check if a colname matches one of the most specific patterns as prefix and replace that,
    // before checking if it matches some less-specific patterns.
    List<Set<String>> prefixesToReplace = new ArrayList<>();
    for (int i = 0; i < repeatedFieldsAlongPath.size(); i++)
      prefixesToReplace.add(new HashSet<>());
    for (List<String> patternList : patterns.getMaximumColumnPatterns()) {
      for (int i = 0; i < patternList.size(); i++)
        prefixesToReplace.get(i).add(patternList.get(i));
    }

    // Prefix replacements based on index in prefixesToReplace: if a prefix of prefixesToReplace.get(0) is found, that
    // prefix needs to be replaced by replacements.get(0).
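    // Illustrative example (assumed shapes, not computed from real data): when flattening over a[*].b[*], with two
    // entries in "a" and up to two entries in each "b", prefixesToReplace could be
    //   [ { "a[0].b[0]", "a[0].b[1]", "a[1].b[0]", "a[1].b[1]" }, { "a[0]", "a[1]" } ]
    // and replacements would then be [ "a.b", "a" ]: the most specific prefixes are tried first.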
    List<String> replacements = repeatedFieldsAlongPath.stream()
        .map(pattern -> pattern.replaceAll(Pattern.quote(repeatedColNameGen.allEntriesIdentifyingSubstr()), ""))
        .collect(Collectors.toList());

    // map from new column name to the input column names that column is based upon. Note that input col names might
    // not exist in inputTableShard, see comments below when newColumns is filled.
    Map<String, SortedSet<String>> newColumns = new HashMap<>();
    // output cols whose row-values are based on input col values where each single row value of those inputs is the
    // value of multiple output rows
    Set<String> multiplicatingOutputCols = new HashSet<>();

    Set<String> allInputColNames = inputTableShard.getColumns().keySet();

    for (String inputColName : allInputColNames) {
      if (allInputLengthColsOfFlattenedFields.contains(inputColName))
        // Remove certain length columns from the set of to-be-created columns. For example when flattening over a[*],
        // we do not want to create an a[length] column, as it simply does not make sense any more since each of the
        // entries in a[*] is now a separate row.
        continue;

      String newColName = null;
      String foundPrefix = null;
      int foundPatternIdx = -1;
      for (int patternIdx = 0; patternIdx < prefixesToReplace.size(); patternIdx++) {
        Set<String> prefixes = prefixesToReplace.get(patternIdx);
        for (String prefix : prefixes) {
          if (inputColName.startsWith(prefix)) {
            newColName = inputColName.replaceFirst(Pattern.quote(prefix), replacements.get(patternIdx));
            foundPrefix = prefix;
            foundPatternIdx = patternIdx;
            if (patternIdx > 0)
              // not the first list of prefixes matched (= created from the pattern equalling the "flatten-by" field),
              // but a less-specific pattern matched. That means this column needs to act in a way that the value of
              // one input row is projected to multiple rows on the output side.
              // Example: matched: a[0], but flattened over a[*].b[*]
              multiplicatingOutputCols.add(newColName);
            break;
          }
        }
        if (newColName != null)
          break;
      }

      if (newColName == null) {
        // no replacement found, this column is on a different path than the flattened one; do not flatten, do not
        // replace.
        newColName = inputColName;
        // At the same time, this column needs to be multiplied: one row of the input col needs to be available in
        // multiple rows of the output.
        multiplicatingOutputCols.add(newColName);
      }

      if (!newColumns.containsKey(newColName))
        newColumns.put(newColName, new TreeSet<>());

      // Add all "potentially available" input columns to the newColName. It could be that for a specific repetition a
      // child-field is missing, e.g. a[0].c does not exist, but a[1].c does. Nevertheless, we need to reserve some
      // "space" for a[0].c in the new column a.c, because otherwise the rows of an existing a[0].d would get mixed up
      // with the rows of a[1].c: a.c would contain the values of the rows of a[1].c first, but a.d would contain
      // a[0].d first.
      if (foundPatternIdx == -1)
        newColumns.get(newColName).add(inputColName);
      else {
        // add all e.g. a[*].c as input columns, no matter if they exist or not.
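        // For illustration (assumed names): if inputColName "a[1].c" matched prefix "a[1]" and maps to new col "a.c",
        // the loop below also registers the possibly non-existing "a[0].c" as an input of "a.c", so that row space is
        // reserved for it.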
        for (String inputPref : prefixesToReplace.get(foundPatternIdx))
          newColumns.get(newColName).add(inputColName.replaceFirst(Pattern.quote(foundPrefix), inputPref));
      }
    }

    logger.trace("Will flatten following columns using following input cols (limit): {}",
        Iterables.limit(newColumns.entrySet(), 100));
    logger.trace("Following columns will be multiplicating (limit): {}",
        Iterables.limit(multiplicatingOutputCols, 100));

    // prepare information of single rows:
    Map<Long, Integer> multiplicationFactorByRowId = new HashMap<>();
    // map from input col prefix to rowIds that are not available for all cols starting with that prefix.
    NavigableMap<String, NavigableSet<Long>> rowIdsNotAvailableForInputCols = new TreeMap<>();
    // number of rows that are generated for each of the prefixes created based on the flatten-by value. Example: when
    // flattening over a[*], this will contain: a[0] -> generates X rows, a[1] -> generates Y rows.
    Map<String, Integer> numberOfRowsByFlattenedPrefix = new HashMap<>();

    for (long inputRowId = inputTableShard.getLowestRowId(); inputRowId < inputTableShard.getLowestRowId()
        + inputTableShard.getNumberOfRowsInShard(); inputRowId++) {
      // find the cols of the "flatten-by" field that actually exist for this row.
      Set<List<String>> colPatterns = patterns.getColumnPatterns(inputRowId);
      Set<String> mostSpecificColPatterns = // most-specific = the flatten-by field!
          colPatterns.stream().flatMap(l -> Stream.of(l.get(0))).collect(Collectors.toSet());

      // This row will produce this many rows in the output.
      int numberOfNewRows = mostSpecificColPatterns.size();
      multiplicationFactorByRowId.put(inputRowId, numberOfNewRows);
      mostSpecificColPatterns.forEach(colPattern -> numberOfRowsByFlattenedPrefix.merge(colPattern, 1, Integer::sum));

      // This row might not have valid values for all those repeated cols that are available in the Table for the
      // flatten-by field. Find those columns that are missing.
      for (String notAvailableColName : Sets.difference(prefixesToReplace.get(0), mostSpecificColPatterns)) {
        if (!rowIdsNotAvailableForInputCols.containsKey(notAvailableColName))
          rowIdsNotAvailableForInputCols.put(notAvailableColName, new TreeSet<>());
        rowIdsNotAvailableForInputCols.get(notAvailableColName).add(inputRowId);
      }
    }

    logger.trace("Multiplication factors are the following for all rows (limit): {}",
        Iterables.limit(multiplicationFactorByRowId.entrySet(), 100));

    int maxMultiplicationFactor =
        multiplicationFactorByRowId.values().stream().mapToInt(Integer::intValue).max().getAsInt();

    // Build new col shards
    List<StandardColumnShard> flattenedColShards = new ArrayList<>();
    for (String newColName : newColumns.keySet()) {
      long nextFirstRowId = inputTableShard.getLowestRowId();

      // find colType by searching an input col that exists and taking the colType of that one.
      ColumnType colType = newColumns.get(newColName).stream()
          .filter(inputColName -> inputTableShard.getColumns().containsKey(inputColName))
          .map(inputColName -> inputTableShard.getColumns().get(inputColName).getColumnType()).findAny().get();

      // Collect all the col dictionaries of the input columns:
      // map from an artificial ID to the dictionary of an input column. The artificial ID is built the following way:
      // The first dict has artificial ID 0.
      // The second dict has artificial ID = number of entries in the first dict.
      // The third dict has artificial ID = number of entries in the first and second dicts.
      // and so on
      // -> basically every entry in every dict gets its own artificial ID; these must not overlap!
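      // Worked example (illustrative numbers): for three input dicts with 3, 5 and 2 entries, the artificial IDs of
      // the dicts are 0, 3 and 8 - each dict starts right after the ID range its predecessors occupy.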
      // The artificial ID is defined in a way so it can be fed to #mergeDicts(.).
      Map<Long, Dictionary<?>> origColDicts = new HashMap<>();
      long nextColAndColDictId = 0L;
      for (String inputColName : newColumns.get(newColName)) {
        Dictionary<?> dict;
        if (inputTableShard.getColumns().containsKey(inputColName))
          dict = inputTableShard.getColumns().get(inputColName).getColumnShardDictionary();
        else {
          // assume we had an input col dict for this non-existing col.
          if (inputColName.endsWith(repeatedColNameGen.lengthIdentifyingSuffix()))
            // length cols get "0" as default.
            dict = new ConstantLongDictionary(0L);
          else
            dict = createDictionaryWithOnlyDefaultValue(colType);
        }

        origColDicts.put(nextColAndColDictId, dict);
        nextColAndColDictId += dict.getMaxId() + 1;
      }

      // merge the input column dicts into the new column dict.
      Pair<Dictionary<?>, Map<Long, Map<Long, Long>>> mergeDictInfo = mergeDicts(newColName, colType, origColDicts);
      Dictionary<?> colDict = mergeDictInfo.getLeft();

      // new col pages.
      List<ColumnPage> flattenedColPages = new ArrayList<>();

      // we'll use the same counting mechanism that we used for origColDicts.
      nextColAndColDictId = 0L;

      long[] nextPageValues = new long[ColumnShardBuilder.PROPOSAL_ROWS];
      int nextPageValueNextIdx = 0;

      // build col pages
      for (String inputColName : newColumns.get(newColName)) {
        long curColId = nextColAndColDictId;

        Map<Long, Long> columnValueIdChangeMap = mergeDictInfo.getRight().get(curColId);

        if (!inputTableShard.getColumns().containsKey(inputColName)) {
          // This col does not exist, therefore we add an "empty" colPage, which resolves statically to the colType's
          // default value.
          // The size of the page is identified by the number of rows the flattened prefix produces.
          int noOfRows = -1;
          for (String prefix : numberOfRowsByFlattenedPrefix.keySet()) {
            if (inputColName.startsWith(prefix)) {
              noOfRows = numberOfRowsByFlattenedPrefix.get(prefix);
              break;
            }
          }
          if (noOfRows == -1)
            throw new IllegalStateException("Could not find number of rows for empty values.");

          for (int i = 0; i < noOfRows; i++) {
            if (nextPageValueNextIdx == nextPageValues.length) {
              flattenedColPages.add(buildColPageFromValueArray(nextPageValues, -1, nextFirstRowId, newColName));
              nextPageValueNextIdx = 0;
              nextFirstRowId += nextPageValues.length;
            }
            nextPageValues[nextPageValueNextIdx++] = columnValueIdChangeMap.get(0L); // constant dict -> always id 0L.
          }

          nextColAndColDictId++; // single entry dict!
          continue;
        }

        Dictionary<?> colShardDict = inputTableShard.getColumns().get(inputColName).getColumnShardDictionary();
        nextColAndColDictId += colShardDict.getMaxId() + 1;

        if (multiplicatingOutputCols.contains(newColName)) {
          // decompress whole column at once, so we can access it quickly later on.
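          // The pass-by-pass emission in the second loop below keeps the row ordering consistent with the
          // "flattened" (non-multiplicating) columns: pass k re-emits exactly those input rows whose multiplication
          // factor is > k, i.e. the rows that have a k-th repetition of the flatten-by field (e.g. a[k]).
          // Illustrative factors [2, 1, 3] for rows 0..2 yield the emission sequence 0,1,2 | 0,2 | 2.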
          StandardColumnShard inputCol = inputTableShard.getColumns().get(inputColName);
          Map<Long, Long[]> colValueIds = new HashMap<>();
          for (ColumnPage inputPage : inputCol.getPages().values()) {
            long[] pageValueIds = inputPage.getValues().decompressedArray();
            Long[] colValueIdsByRow = inputPage.getColumnPageDict()
                .decompressValues(LongStream.of(pageValueIds).boxed().toArray(l -> new Long[l]));
            colValueIds.put(inputPage.getFirstRowId(), colValueIdsByRow);
          }

          for (int multiplication = 0; multiplication < maxMultiplicationFactor; multiplication++)
            for (ColumnPage inputPage : inputTableShard.getColumns().get(inputColName).getPages().values()) {
              final int curMultiplicationNo = multiplication;
              for (int i = 0; i < inputPage.getValues().size(); i++) {
                Integer thisIndexMultiplicationFactor =
                    multiplicationFactorByRowId.get(inputPage.getFirstRowId() + i);
                if (thisIndexMultiplicationFactor == null)
                  thisIndexMultiplicationFactor = 1;

                if (thisIndexMultiplicationFactor > curMultiplicationNo) {
                  // we need to multiplicate this row!
                  if (nextPageValueNextIdx == nextPageValues.length) {
                    flattenedColPages.add(buildColPageFromValueArray(nextPageValues, -1, nextFirstRowId, newColName));
                    nextPageValueNextIdx = 0;
                    nextFirstRowId += nextPageValues.length;
                  }
                  long origColValueId = colValueIds.get(inputPage.getFirstRowId())[i];
                  nextPageValues[nextPageValueNextIdx++] =
                      (columnValueIdChangeMap != null) ? columnValueIdChangeMap.get(origColValueId) : origColValueId;
                }
              }
            }
        } else {
          for (ColumnPage inputPage : inputTableShard.getColumns().get(inputColName).getPages().values()) {
            // decompress whole column page at once, so we can access it quickly later on.
            long[] pageValueIds = inputPage.getValues().decompressedArray();
            Long[] colValueIdsByRow = inputPage.getColumnPageDict()
                .decompressValues(LongStream.of(pageValueIds).boxed().toArray(l -> new Long[l]));

            Set<Long> sortedNotAvailableIndices;
            String interestingPrefix = rowIdsNotAvailableForInputCols.floorKey(inputColName);
            if (interestingPrefix != null && inputColName.startsWith(interestingPrefix)) {
              sortedNotAvailableIndices = rowIdsNotAvailableForInputCols.get(interestingPrefix)
                  .subSet(inputPage.getFirstRowId(), inputPage.getFirstRowId() + inputPage.getValues().size());
            } else
              sortedNotAvailableIndices = new HashSet<>();

            // peek next unavailable index; works because indices are sorted.
            PeekingIterator<Long> notAvailableIndicesIt =
                Iterators.peekingIterator(sortedNotAvailableIndices.iterator());

            for (int i = 0; i < inputPage.getValues().size(); i++) {
              if (notAvailableIndicesIt.hasNext()
                  && notAvailableIndicesIt.peek() == inputPage.getFirstRowId() + i) {
                notAvailableIndicesIt.next();
                continue;
              }

              if (nextPageValueNextIdx == nextPageValues.length) {
                flattenedColPages.add(buildColPageFromValueArray(nextPageValues, -1, nextFirstRowId, newColName));
                nextPageValueNextIdx = 0;
                nextFirstRowId += nextPageValues.length;
              }
              long origColValueId = colValueIdsByRow[i];
              nextPageValues[nextPageValueNextIdx++] =
                  (columnValueIdChangeMap != null) ? columnValueIdChangeMap.get(origColValueId) : origColValueId;
            }
          }
        }
      }

      if (nextPageValueNextIdx > 0) {
        flattenedColPages
            .add(buildColPageFromValueArray(nextPageValues, nextPageValueNextIdx, nextFirstRowId, newColName));
        nextFirstRowId += nextPageValueNextIdx;
        nextPageValueNextIdx = 0;
      }

      NavigableMap<Long, ColumnPage> navigableFlattenedColPages = new TreeMap<>();
      for (ColumnPage flattenedColPage : flattenedColPages)
        navigableFlattenedColPages.put(flattenedColPage.getFirstRowId(), flattenedColPage);

      StandardColumnShard flattenedColShard = null;
      switch (colType) {
      case STRING:
        flattenedColShard = columnShardFactory.createStandardStringColumnShard(newColName, navigableFlattenedColPages,
            (StringDictionary<?>) colDict);
        break;
      case LONG:
        flattenedColShard = columnShardFactory.createStandardLongColumnShard(newColName, navigableFlattenedColPages,
            (LongDictionary<?>) colDict);
        break;
      case DOUBLE:
        flattenedColShard = columnShardFactory.createStandardDoubleColumnShard(newColName, navigableFlattenedColPages,
            (DoubleDictionary<?>) colDict);
        break;
      }

      flattenedColShards.add(flattenedColShard);
      logger.trace("Created flattened column {}", newColName);
    }

    TableShard flattenedTableShard = tableFactory.createDefaultTableShard(resultTableName, flattenedColShards);
    logger.trace("Created flattened table shard {}", resultTableName);
    return flattenedTableShard;
  }

  /**
   * Merges multiple col dicts into one.
   *
   * <p>
   * The input dictionaries are expected to be of type T. T must be {@link Comparable} (which is no problem for our
   * value types String, Long and Double).
   *
   * @param inputDicts
   *          The col dicts of the input cols, indexed by an artificial "dictionary id" which can be chosen
   *          arbitrarily.
   * @return Pair of the merged dictionary and, for each input dict ID, a mapping map. That map maps from the old col
   *         dict ID of a value to the new col dict ID in the merged dict. The map can be empty.
   */
  @SuppressWarnings("unchecked")
  private <T extends Comparable<T>> Pair<Dictionary<?>, Map<Long, Map<Long, Long>>> mergeDicts(String colName,
      ColumnType colType, Map<Long, Dictionary<?>> inputDicts) throws IllegalStateException {
    Map<Long, Map<Long, Long>> resMappingMap = new HashMap<>();
    if (inputDicts.size() == 1) {
      return new Pair<>(inputDicts.values().iterator().next(), resMappingMap);
    }

    Map<Long, PeekingIterator<Pair<Long, T>>> iterators = new HashMap<>();
    for (Entry<Long, Dictionary<?>> e : inputDicts.entrySet()) {
      if (e.getValue().getMaxId() == null)
        continue;
      iterators.put(e.getKey(), Iterators.peekingIterator(((Dictionary<T>) e.getValue()).iterator()));
    }

    // order the next elements of all dicts by their value.
    // Pair of (Pair of ID in dict and value) and dictId
    PriorityQueue<Pair<Pair<Long, T>, Long>> nextElements =
        new PriorityQueue<>((p1, p2) -> p1.getLeft().getRight().compareTo(p2.getLeft().getRight()));
    for (Entry<Long, PeekingIterator<Pair<Long, T>>> e : iterators.entrySet())
      nextElements.add(new Pair<>(e.getValue().peek(), e.getKey()));

    // map from value to new ID which will be fed into the dictionary builder.
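    // Sketch of the merge (illustrative values): dict 0 = { 0 -> "a", 1 -> "c" } and dict 2 = { 0 -> "b", 1 -> "c" }
    // merge into entityMap { "a" -> 0, "b" -> 1, "c" -> 2 }, with mapping maps 0 -> { 0 -> 0, 1 -> 2 } and
    // 2 -> { 0 -> 1, 1 -> 2 }: a classic k-way merge over the sorted dictionary iterators using the priority queue.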
    NavigableMap<T, Long> entityMap = new TreeMap<>();
    long nextEntityId = 0L;

    Pair<T, Long> previous = null;
    // traverse all dictionaries and build mapping list
    while (!nextElements.isEmpty()) {
      Pair<Pair<Long, T>, Long> p = nextElements.poll();
      Long dictId = p.getRight();
      Pair<Long, T> valuePair = p.getLeft();

      // move iterator forward
      iterators.get(dictId).next();
      if (iterators.get(dictId).hasNext())
        nextElements.add(new Pair<>(iterators.get(dictId).peek(), dictId));

      long idInInputDict = valuePair.getLeft();
      if (previous == null || valuePair.getRight().compareTo(previous.getLeft()) > 0) {
        long resultNewId = nextEntityId++;
        entityMap.put(valuePair.getRight(), resultNewId);
        previous = new Pair<>(valuePair.getRight(), resultNewId);
      }

      if (!resMappingMap.containsKey(dictId))
        resMappingMap.put(dictId, new HashMap<>());
      resMappingMap.get(dictId).put(idInInputDict, previous.getRight());
    }

    Dictionary<?> resDict = null;
    Map<Long, Long> builderAdjustMap = null;
    switch (colType) {
    case LONG:
      CompressedLongDictionaryBuilder longBuilder = new CompressedLongDictionaryBuilder();
      longBuilder.withDictionaryName(colName).fromEntityMap((NavigableMap<Long, Long>) entityMap);
      Pair<LongDictionary<?>, Map<Long, Long>> longPair = longBuilder.build();
      builderAdjustMap = longPair.getRight();
      resDict = longPair.getLeft();
      break;
    case STRING:
      CompressedStringDictionaryBuilder stringBuilder = new CompressedStringDictionaryBuilder();
      stringBuilder.fromEntityMap((NavigableMap<String, Long>) entityMap);
      Pair<StringDictionary<?>, Map<Long, Long>> stringPair = stringBuilder.build();
      builderAdjustMap = stringPair.getRight();
      resDict = stringPair.getLeft();
      break;
    case DOUBLE:
      CompressedDoubleDictionaryBuilder doubleBuilder = new CompressedDoubleDictionaryBuilder();
      doubleBuilder.fromEntityMap((NavigableMap<Double, Long>) entityMap);
      Pair<DoubleDictionary<?>, Map<Long, Long>> doublePair = doubleBuilder.build();
      builderAdjustMap = doublePair.getRight();
      resDict = doublePair.getLeft();
      break;
    }

    if (!builderAdjustMap.isEmpty())
      throw new IllegalStateException(
          "IDs of new col dict for col " + colName + " were adjusted although that was not expected!");

    return new Pair<Dictionary<?>, Map<Long, Map<Long, Long>>>(resDict, resMappingMap);
  }

  /**
   * Creates a new dictionary of the correct type, which will have a single entry at ID 0: the default value for the
   * given type.
   */
  private Dictionary<?> createDictionaryWithOnlyDefaultValue(ColumnType colType) {
    switch (colType) {
    case STRING:
      return new ConstantStringDictionary(LoaderColumnInfo.DEFAULT_STRING);
    case LONG:
      return new ConstantLongDictionary(LoaderColumnInfo.DEFAULT_LONG);
    case DOUBLE:
      return new ConstantDoubleDictionary(LoaderColumnInfo.DEFAULT_DOUBLE);
    }
    return null; // never happens
  }

  /**
   * Builds a new {@link ColumnPage} from a simple values array.
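   *
   * <p>
   * For illustration (assumed values): colPageValues = [5, 5, 7] yields valueMap { 5 -> 0, 7 -> 1 }, and the page
   * then stores the temp IDs [0, 0, 1], which the {@link ColumnPageBuilder} compresses.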
   *
   * @param colPageValues
   *          Contains the actual value the colPage should have for each row. This long array might be changed by this
   *          method and its values are not valid any more upon return of this method.
   * @param colPageValuesLength
   *          The number of entries in the colPageValues array that should actually be used. Pass -1 to use the whole
   *          colPageValues array.
   * @param firstRowId
   *          First row ID of the resulting {@link ColumnPage}.
   * @param colName
   *          The name of the column that the new col page will be part of.
   * @return The new {@link ColumnPage}.
   */
  private ColumnPage buildColPageFromValueArray(long[] colPageValues, int colPageValuesLength, long firstRowId,
      String colName) {
    if (colPageValuesLength != -1) {
      long[] newColPageValues = new long[colPageValuesLength];
      System.arraycopy(colPageValues, 0, newColPageValues, 0, colPageValuesLength);
      colPageValues = newColPageValues;
    }

    // create the needed "valueMap" from actual value to a temp ID and replace colPageValues with those temp IDs.
    NavigableMap<Long, Long> valueMap = new TreeMap<>();
    long nextFreeTempId = 0L;
    for (int i = 0; i < colPageValues.length; i++) {
      if (!valueMap.containsKey(colPageValues[i]))
        valueMap.put(colPageValues[i], nextFreeTempId++);
      colPageValues[i] = valueMap.get(colPageValues[i]);
    }

    ColumnPageBuilder builder = new ColumnPageBuilder(columnPageFactory);
    builder.withColumnPageName(colName + "#" + firstRowId).withFirstRowId(firstRowId).withValueMap(valueMap)
        .withValues(colPageValues); // use same array here, builder will change this array again.

    return builder.build();
  }
}