/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.flatten;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.UUID;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
import java.util.stream.Stream;

import javax.inject.Inject;

import org.diqube.context.AutoInstatiate;
import org.diqube.data.column.ColumnPage;
import org.diqube.data.column.ColumnPageFactory;
import org.diqube.data.column.ColumnShardFactory;
import org.diqube.data.column.ColumnType;
import org.diqube.data.column.StandardColumnShard;
import org.diqube.data.dictionary.Dictionary;
import org.diqube.data.flatten.FlattenDataFactory;
import org.diqube.data.flatten.FlattenedTable;
import org.diqube.data.table.Table;
import org.diqube.data.table.TableFactory;
import org.diqube.data.table.TableShard;
import org.diqube.data.types.dbl.dict.ConstantDoubleDictionary;
import org.diqube.data.types.dbl.dict.DoubleDictionary;
import org.diqube.data.types.lng.dict.ConstantLongDictionary;
import org.diqube.data.types.lng.dict.LongDictionary;
import org.diqube.data.types.str.dict.ConstantStringDictionary;
import org.diqube.data.types.str.dict.StringDictionary;
import org.diqube.executionenv.querystats.QueryableLongColumnShardFacade;
import org.diqube.executionenv.util.ColumnPatternUtil;
import org.diqube.executionenv.util.ColumnPatternUtil.ColumnPatternContainer;
import org.diqube.executionenv.util.ColumnPatternUtil.LengthColumnMissingException;
import org.diqube.executionenv.util.ColumnPatternUtil.PatternException;
import org.diqube.loader.LoaderColumnInfo;
import org.diqube.loader.columnshard.ColumnPageBuilder;
import org.diqube.loader.columnshard.ColumnShardBuilder;
import org.diqube.loader.compression.CompressedDoubleDictionaryBuilder;
import org.diqube.loader.compression.CompressedLongDictionaryBuilder;
import org.diqube.loader.compression.CompressedStringDictionaryBuilder;
import org.diqube.name.FlattenedTableNameUtil;
import org.diqube.name.RepeatedColumnNameGenerator;
import org.diqube.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.PeekingIterator;
import com.google.common.collect.Sets;

/**
 * Flattens a {@link Table} on a specific (repeated) field, i.e.
 * for each entry in the repeated field denoted by the flatten-by field, the resulting table will contain a separate
 * row.
 *
 * <p>
 * The resulting table will have a different number of rows, as for each index of the repeated field of each row, a
 * new row will be provided.
 *
 * <p>
 * Example input table with two rows and a nested array:
 *
 * <pre>
 * { a : [ { b : 1 },
 *         { b : 2 } ],
 *   c : 9 },
 * { a : [ { b : 3 },
 *         { b : 4 } ],
 *   c : 10}
 * </pre>
 *
 * When flattening this over "a[*]", each element of the a[.] arrays becomes a separate row (= table with 4 rows):
 *
 * <pre>
 * { a.b : 1, c : 9 },
 * { a.b : 2, c : 9 },
 * { a.b : 3, c : 10 },
 * { a.b : 4, c : 10 }
 * </pre>
 *
 * <p>
 * Note that values are not validated in any way. That means that if a specific entry in the array did not have all
 * fields defined, those non-defined fields will be non-defined in the resulting rows, too. TODO #14: Support optional
 * columns.
 *
 * @author Bastian Gloeckle
 */
@AutoInstatiate
public class Flattener {
  private static final Logger logger = LoggerFactory.getLogger(Flattener.class);

  @Inject
  private FlattenDataFactory factory;

  @Inject
  private RepeatedColumnNameGenerator repeatedColNameGen;

  @Inject
  private FlattenedTableNameUtil flattenedTableNameGen;

  @Inject
  private ColumnPatternUtil colPatternUtil;

  @Inject
  private ColumnPageFactory columnPageFactory;

  @Inject
  private ColumnShardFactory columnShardFactory;

  @Inject
  private TableFactory tableFactory;

  /**
   * Flattens the given table by the given flatten-by field, returning a preliminary flattened table (see below).
   *
   * <p>
   * For details, see the class doc.
   *
   * <p>
   * For each input TableShard, one new {@link TableShard} will be created. Note that each flattened table shard will
   * have the same firstRowId as the corresponding input table shard - although the flattened shards will usually
   * contain more rows. This means that the rowIds in the returned flattened table will most probably overlap. <b>This
   * needs to be fixed after calling this method, otherwise the table is not usable!</b> Typically a table is spread
   * over multiple cluster nodes, which means that fixing the firstRowIds requires communicating with the other nodes;
   * therefore this util class does not take care of it.
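   *
   * <p>
   * A minimal usage sketch (illustrative only: how the bean and the input table are obtained is assumed here, as is
   * the repeated field "orders"; the firstRowIds of the resulting shards still need to be adjusted as described
   * above):
   *
   * <pre>
   * Flattener flattener = context.getBean(Flattener.class); // assuming a context that provides this bean
   * FlattenedTable flat = flattener.flattenTable(customersTable, null, "orders[*]", UUID.randomUUID());
   * // "flat" now contains one row per entry of the repeated field "orders" of each original row.
   * </pre>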
   *
   * @param inputTable
   *          The table that should be flattened. This cannot be an already flattened table.
   * @param inputTableShards
   *          The tableShards to work on. If this is not set (== <code>null</code>), the tableShards will be read from
   *          the inputTable.
   * @param flattenByField
   *          The field to flatten by, in the usual "all-array-notation" as defined in
   *          {@link RepeatedColumnNameGenerator} (e.g. a[*].c.b[*] to get a single row for each index in all the "b"
   *          arrays in a[*].c).
   * @param flattenId
   *          The ID of the flattening that should be used to generate the output table name.
   * @return The flattened table.
   * @throws IllegalArgumentException
   *           If a passed argument is invalid.
   * @throws PatternException
   *           If the flattenByField pattern was not recognized.
   * @throws LengthColumnMissingException
   *           If any required "length" col is missing.
   * @throws IllegalStateException
   *           If the table cannot be flattened for any reason.
   */
  public FlattenedTable flattenTable(Table inputTable, Collection<TableShard> inputTableShards, String flattenByField,
      UUID flattenId)
      throws IllegalArgumentException, IllegalStateException, PatternException, LengthColumnMissingException {
    if (inputTable instanceof FlattenedTable)
      throw new IllegalArgumentException("Cannot flatten an already flattened table.");

    if (!flattenByField.endsWith(repeatedColNameGen.allEntriesIdentifyingSubstr()))
      throw new IllegalArgumentException(
          "Flatten-By field does not end with '" + repeatedColNameGen.allEntriesIdentifyingSubstr() + "'");

    String resultTableName =
        flattenedTableNameGen.createFlattenedTableName(inputTable.getName(), flattenByField, flattenId);

    if (inputTableShards == null)
      inputTableShards = inputTable.getShards();

    List<TableShard> flattenedTableShards = new ArrayList<>();
    for (TableShard shard : inputTableShards)
      flattenedTableShards.add(flattenTableShard(resultTableName, shard, flattenByField));

    Set<Long> firstRowIdsOfInputShards =
        inputTableShards.stream().map(shard -> shard.getLowestRowId()).collect(Collectors.toSet());

    return factory.createFlattenedTable(resultTableName, flattenedTableShards, firstRowIdsOfInputShards);
  }

  /**
   * Flattens a single {@link TableShard}.
   *
   * <p>
   * This works as follows:
   *
   * <ol>
   * <li>Find all patterns the flatten-by-field pattern matches. These are the prefixes of the column names from which
   * a new row will be created.
   * <li>Also find the names of the length columns of these patterns.
   * <li>Produce a to-do list: What is the name of each output column and what input columns is that output column
   * created from?
   * <ul>
   * <li>Is the new column a "multiplicating col"? These are cols that are outside of the path of the repeated column
   * that is flattened over. Nevertheless, each input col contains a value for each row: a single row-value of the
   * input columns needs to be available for multiple rows of the output table.
   * <li>Remove previously found length-columns from the to-be-created col list (when flattening over a[*] we do not
   * want an a[length] column to appear in the output!).
   * </ul>
   * <li>Iterate over all rows of the input cols and identify for each row (1) how many output rows that row will
   * create (taking into account the length columns of the flatten-by field in that row) and (2) whether this row is
   * missing any child-fields (i.e. given an array a[*].c[*] and flattening over a[*], there are output cols a.c[0],
   * a.c[1], a.c[2], but it could be that a specific row does not contain a.c[2], because that row simply does not
   * have that many entries in the array).
   * <li>Build the new columns - each new column can either be "multiplicating" (see above), in which case the col
   * pages are repeated accordingly (and no-longer-repeated rows are removed from the repeated colPages), or it can be
   * "flattened" - in which case the col is a sub-field of the flattened one and we only need to remove rows that do
   * not contain any value.
   * </ol>
   *
   * We need to ensure that we do not mess up the row-ordering of the various output columns: each output column needs
   * to have the same number of rows and the rowIds need to match correctly. Therefore, when creating a column based
   * on input columns of which not all are actually materialized, we need to insert "constant" column pages into the
   * output which will then resolve to default values.
   *
   * <p>
   * Example:
   *
   * Source table:
   *
   * <pre>
   * {a:[ { b:[1] },
   *      { b:[2, 3] }]},
   * {a:[ { b:[4] },
   *      { b:[5, 6] }]}
   * </pre>
   *
   * In this example, there will be no column a[0].b[1] in the input (as all a[0]s have at most a single entry in .b).
   * If we now mapped new columns to col pages of old columns in the following way (flattened over a[*]; displayed is
   * the list of col pages that are consecutively accessed for a new column):
   *
   * <pre>
   * a.b[0] = [ all col pages of a[0].b[0] ]
   * a.b[1] = [ all col pages of a[0].b[1], all col pages of a[1].b[1] ]
   * a.b[length] = [ all col pages of a[0].b[length], all col pages of a[1].b[length] ]
   * </pre>
   *
   * ... we would mess up, as a.b[0] would have fewer rows than a.b[1] -> we need to add a "constant" colPage to
   * a.b[0] that resolves to a default value. Note that we will nevertheless probably never resolve those default
   * values (at least in this example), as the a.b[length] value will not allow us to iterate that far in the
   * corresponding rows.
   *
   * <p>
   * Note that the resulting TableShard will have the same first row ID as the input TableShard. If multiple
   * TableShards of the same table are flattened (which is usually the case), the row IDs might overlap after
   * flattening (since every TableShard keeps its original firstRowId, but each table shard contains more rows). The
   * rowIds need to be adjusted afterwards!
   */
  private TableShard flattenTableShard(String resultTableName, TableShard inputTableShard, String flattenByField)
      throws PatternException, LengthColumnMissingException, IllegalStateException {
    String[] flattenFieldSplit =
        flattenByField.split(Pattern.quote(repeatedColNameGen.allEntriesIdentifyingSubstr() + "."));

    List<String> repeatedFieldsAlongPath = new ArrayList<>();
    String prev = "";
    for (String splitPart : flattenFieldSplit) {
      if (!"".equals(prev))
        prev += ".";

      prev += splitPart;

      if (!splitPart.endsWith(repeatedColNameGen.allEntriesIdentifyingSubstr()))
        prev += repeatedColNameGen.allEntriesIdentifyingSubstr();

      repeatedFieldsAlongPath.add(prev);
    }

    // calculate the most specific patterns first - colPatternUtil will return its lists in the same ordering!
    repeatedFieldsAlongPath = Lists.reverse(repeatedFieldsAlongPath);

    Set<String> allInputLengthColsOfFlattenedFields = new HashSet<>();

    ColumnPatternContainer patterns = colPatternUtil.findColNamesForColNamePattern(lengthColName -> {
      allInputLengthColsOfFlattenedFields.add(lengthColName);
      return new QueryableLongColumnShardFacade(inputTableShard.getLongColumns().get(lengthColName));
    }, repeatedFieldsAlongPath);

    // transpose result of colPatternUtil: collect all the most specific patterns in a set, then the second-most
    // specific patterns etc.
    // Later we want to first check if a colname matches one of the most specific patterns as prefix and replace that,
    // before checking if it matches some less-specific patterns.
    List<Set<String>> prefixesToReplace = new ArrayList<>();
    for (int i = 0; i < repeatedFieldsAlongPath.size(); i++)
      prefixesToReplace.add(new HashSet<>());
    for (List<String> patternList : patterns.getMaximumColumnPatterns()) {
      for (int i = 0; i < patternList.size(); i++)
        prefixesToReplace.get(i).add(patternList.get(i));
    }

    // Prefix replacements based on index in prefixesToReplace: if a prefix of prefixesToReplace.get(0) is found, that
    // prefix needs to be replaced by replacements.get(0).
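    // Illustrative example (assumed shapes, not computed from real data): when flattening over a[*].b[*], with two
    // entries in "a" and up to two entries in each "b", prefixesToReplace could be
    //   [ { "a[0].b[0]", "a[0].b[1]", "a[1].b[0]", "a[1].b[1]" }, { "a[0]", "a[1]" } ]
    // and replacements would then be [ "a.b", "a" ]: the most specific prefixes are tried first.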
    List<String> replacements = repeatedFieldsAlongPath.stream()
        .map(pattern -> pattern.replaceAll(Pattern.quote(repeatedColNameGen.allEntriesIdentifyingSubstr()), ""))
        .collect(Collectors.toList());

    // map from new column name to the input column names that column is based upon. Note that input col names might
    // not exist in inputTableShard, see comments below when newColumns is filled.
    Map<String, SortedSet<String>> newColumns = new HashMap<>();
    // output cols whose row-values are based on input col values where each single row value of those inputs is the
    // value of multiple output rows
    Set<String> multiplicatingOutputCols = new HashSet<>();

    Set<String> allInputColNames = inputTableShard.getColumns().keySet();

    for (String inputColName : allInputColNames) {
      if (allInputLengthColsOfFlattenedFields.contains(inputColName))
        // Remove certain length columns from the set of to-be-created columns. For example when flattening over a[*],
        // we do not want to create an a[length] column, as it simply does not make sense any more since each of the
        // entries in a[*] is now a separate row.
        continue;

      String newColName = null;
      String foundPrefix = null;
      int foundPatternIdx = -1;
      for (int patternIdx = 0; patternIdx < prefixesToReplace.size(); patternIdx++) {
        Set<String> prefixes = prefixesToReplace.get(patternIdx);
        for (String prefix : prefixes) {
          if (inputColName.startsWith(prefix)) {
            newColName = inputColName.replaceFirst(Pattern.quote(prefix), replacements.get(patternIdx));
            foundPrefix = prefix;
            foundPatternIdx = patternIdx;
            if (patternIdx > 0)
              // not the first list of prefixes matched (= created from the pattern equalling the "flatten-by" field),
              // but a less-specific pattern matched. That means this column needs to act in a way that the value of
              // one input row is projected to multiple rows on the output side.
              // Example: matched: a[0], but flattened over a[*].b[*]
              multiplicatingOutputCols.add(newColName);
            break;
          }
        }
        if (newColName != null)
          break;
      }

      if (newColName == null) {
        // no replacement found, this column is on a different path than the flattened one; do not flatten, do not
        // replace.
        newColName = inputColName;
        // At the same time, this column needs to be multiplied: one row of the input col needs to be available in
        // multiple rows of the output.
        multiplicatingOutputCols.add(newColName);
      }

      if (!newColumns.containsKey(newColName))
        newColumns.put(newColName, new TreeSet<>());

      // Add all "potentially available" input columns to the newColName. It could be that for a specific repetition a
      // child-field is missing, e.g. a[0].c does not exist, but a[1].c does. Nevertheless, we need to reserve some
      // "space" for a[0].c in the new column a.c, because otherwise the rows of an existing a[0].d would get mixed up
      // with the rows of a[1].c: a.c would contain the values of the rows of a[1].c first, but a.d would contain
      // a[0].d first.
      if (foundPatternIdx == -1)
        newColumns.get(newColName).add(inputColName);
      else {
        // add all e.g. a[*].c as input columns, no matter if they exist or not.
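        // For illustration (assumed names): if inputColName "a[1].c" matched prefix "a[1]" and maps to new col "a.c",
        // the loop below also registers the possibly non-existing "a[0].c" as an input of "a.c", so that row space is
        // reserved for it.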
        for (String inputPref : prefixesToReplace.get(foundPatternIdx))
          newColumns.get(newColName).add(inputColName.replaceFirst(Pattern.quote(foundPrefix), inputPref));
      }
    }

    logger.trace("Will flatten following columns using following input cols (limit): {}",
        Iterables.limit(newColumns.entrySet(), 100));
    logger.trace("Following columns will be multiplicating (limit): {}",
        Iterables.limit(multiplicatingOutputCols, 100));

    // prepare information of single rows:
    Map<Long, Integer> multiplicationFactorByRowId = new HashMap<>();
    // map from input col prefix to rowIds that are not available for all cols starting with that prefix.
    NavigableMap<String, NavigableSet<Long>> rowIdsNotAvailableForInputCols = new TreeMap<>();
    // number of rows that are generated for each of the prefixes created based on the flatten-by value. Example: when
    // flattening over a[*], this will contain: a[0] -> generates X rows, a[1] -> generates Y rows.
    Map<String, Integer> numberOfRowsByFlattenedPrefix = new HashMap<>();

    for (long inputRowId = inputTableShard.getLowestRowId(); inputRowId < inputTableShard.getLowestRowId()
        + inputTableShard.getNumberOfRowsInShard(); inputRowId++) {
      // find the cols of the "flatten-by" field that actually exist for this row.
      Set<List<String>> colPatterns = patterns.getColumnPatterns(inputRowId);
      Set<String> mostSpecificColPatterns = // most-specific = the flatten-by field!
          colPatterns.stream().flatMap(l -> Stream.of(l.get(0))).collect(Collectors.toSet());

      // This row will produce this many rows in the output.
      int numberOfNewRows = mostSpecificColPatterns.size();
      multiplicationFactorByRowId.put(inputRowId, numberOfNewRows);
      mostSpecificColPatterns.forEach(colPattern -> numberOfRowsByFlattenedPrefix.merge(colPattern, 1, Integer::sum));

      // This row might not have valid values for all those repeated cols that are available in the Table for the
      // flatten-by field. Find those columns that are missing.
      for (String notAvailableColName : Sets.difference(prefixesToReplace.get(0), mostSpecificColPatterns)) {
        if (!rowIdsNotAvailableForInputCols.containsKey(notAvailableColName))
          rowIdsNotAvailableForInputCols.put(notAvailableColName, new TreeSet<>());
        rowIdsNotAvailableForInputCols.get(notAvailableColName).add(inputRowId);
      }
    }

    logger.trace("Multiplication factors are the following for all rows (limit): {}",
        Iterables.limit(multiplicationFactorByRowId.entrySet(), 100));

    int maxMultiplicationFactor =
        multiplicationFactorByRowId.values().stream().mapToInt(Integer::intValue).max().getAsInt();

    // Build new col shards
    List<StandardColumnShard> flattenedColShards = new ArrayList<>();
    for (String newColName : newColumns.keySet()) {
      long nextFirstRowId = inputTableShard.getLowestRowId();

      // find colType by searching an input col that exists and taking the colType of that one.
      ColumnType colType = newColumns.get(newColName).stream()
          .filter(inputColName -> inputTableShard.getColumns().containsKey(inputColName))
          .map(inputColName -> inputTableShard.getColumns().get(inputColName).getColumnType()).findAny().get();

      // Collect all the col dictionaries of the input columns:
      // map from an artificial ID to the dictionary of an input column. The artificial ID is built the following way:
      // The first dict has artificial ID 0.
      // The second dict has artificial ID = number of entries in the first dict.
      // The third dict has artificial ID = number of entries in the first and second dicts.
      // and so on
      // -> basically every entry in every dict gets its own artificial ID; these must not overlap!
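      // Worked example (illustrative numbers): for three input dicts with 3, 5 and 2 entries, the artificial IDs of
      // the dicts are 0, 3 and 8 - each dict starts right after the ID range its predecessors occupy.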
      // The artificial ID is defined in a way so it can be fed to #mergeDicts(.).
      Map<Long, Dictionary<?>> origColDicts = new HashMap<>();
      long nextColAndColDictId = 0L;
      for (String inputColName : newColumns.get(newColName)) {
        Dictionary<?> dict;
        if (inputTableShard.getColumns().containsKey(inputColName))
          dict = inputTableShard.getColumns().get(inputColName).getColumnShardDictionary();
        else {
          // assume we had an input col dict for this non-existing col.
          if (inputColName.endsWith(repeatedColNameGen.lengthIdentifyingSuffix()))
            // length cols get "0" as default.
            dict = new ConstantLongDictionary(0L);
          else
            dict = createDictionaryWithOnlyDefaultValue(colType);
        }

        origColDicts.put(nextColAndColDictId, dict);
        nextColAndColDictId += dict.getMaxId() + 1;
      }

      // merge the input column dicts into the new column dict.
      Pair<Dictionary<?>, Map<Long, Map<Long, Long>>> mergeDictInfo = mergeDicts(newColName, colType, origColDicts);
      Dictionary<?> colDict = mergeDictInfo.getLeft();

      // new col pages.
      List<ColumnPage> flattenedColPages = new ArrayList<>();

      // we'll use the same counting mechanism that we used for origColDicts.
      nextColAndColDictId = 0L;

      long[] nextPageValues = new long[ColumnShardBuilder.PROPOSAL_ROWS];
      int nextPageValueNextIdx = 0;

      // build col pages
      for (String inputColName : newColumns.get(newColName)) {
        long curColId = nextColAndColDictId;

        Map<Long, Long> columnValueIdChangeMap = mergeDictInfo.getRight().get(curColId);

        if (!inputTableShard.getColumns().containsKey(inputColName)) {
          // This col does not exist, therefore we add an "empty" colPage, which resolves statically to the colType's
          // default value.
          // The size of the page is identified by the number of rows the flattened prefix produces.
          int noOfRows = -1;
          for (String prefix : numberOfRowsByFlattenedPrefix.keySet()) {
            if (inputColName.startsWith(prefix)) {
              noOfRows = numberOfRowsByFlattenedPrefix.get(prefix);
              break;
            }
          }
          if (noOfRows == -1)
            throw new IllegalStateException("Could not find number of rows for empty values.");

          for (int i = 0; i < noOfRows; i++) {
            if (nextPageValueNextIdx == nextPageValues.length) {
              flattenedColPages.add(buildColPageFromValueArray(nextPageValues, -1, nextFirstRowId, newColName));
              nextPageValueNextIdx = 0;
              nextFirstRowId += nextPageValues.length;
            }
            nextPageValues[nextPageValueNextIdx++] = columnValueIdChangeMap.get(0L); // constant dict -> always id 0L.
          }

          nextColAndColDictId++; // single entry dict!
          continue;
        }

        Dictionary<?> colShardDict = inputTableShard.getColumns().get(inputColName).getColumnShardDictionary();
        nextColAndColDictId += colShardDict.getMaxId() + 1;

        if (multiplicatingOutputCols.contains(newColName)) {
          // decompress whole column at once, so we can access it quickly later on.
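          // The pass-by-pass emission in the second loop below keeps the row ordering consistent with the
          // "flattened" (non-multiplicating) columns: pass k re-emits exactly those input rows whose multiplication
          // factor is > k, i.e. the rows that have a k-th repetition of the flatten-by field (e.g. a[k]).
          // Illustrative factors [2, 1, 3] for rows 0..2 yield the emission sequence 0,1,2 | 0,2 | 2.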
          StandardColumnShard inputCol = inputTableShard.getColumns().get(inputColName);
          Map<Long, Long[]> colValueIds = new HashMap<>();
          for (ColumnPage inputPage : inputCol.getPages().values()) {
            long[] pageValueIds = inputPage.getValues().decompressedArray();
            Long[] colValueIdsByRow = inputPage.getColumnPageDict()
                .decompressValues(LongStream.of(pageValueIds).boxed().toArray(l -> new Long[l]));
            colValueIds.put(inputPage.getFirstRowId(), colValueIdsByRow);
          }

          for (int multiplication = 0; multiplication < maxMultiplicationFactor; multiplication++)
            for (ColumnPage inputPage : inputTableShard.getColumns().get(inputColName).getPages().values()) {
              final int curMultiplicationNo = multiplication;
              for (int i = 0; i < inputPage.getValues().size(); i++) {
                Integer thisIndexMultiplicationFactor =
                    multiplicationFactorByRowId.get(inputPage.getFirstRowId() + i);
                if (thisIndexMultiplicationFactor == null)
                  thisIndexMultiplicationFactor = 1;

                if (thisIndexMultiplicationFactor > curMultiplicationNo) {
                  // we need to multiplicate this row!
                  if (nextPageValueNextIdx == nextPageValues.length) {
                    flattenedColPages.add(buildColPageFromValueArray(nextPageValues, -1, nextFirstRowId, newColName));
                    nextPageValueNextIdx = 0;
                    nextFirstRowId += nextPageValues.length;
                  }
                  long origColValueId = colValueIds.get(inputPage.getFirstRowId())[i];
                  nextPageValues[nextPageValueNextIdx++] =
                      (columnValueIdChangeMap != null) ? columnValueIdChangeMap.get(origColValueId) : origColValueId;
                }
              }
            }
        } else {
          for (ColumnPage inputPage : inputTableShard.getColumns().get(inputColName).getPages().values()) {
            // decompress whole column page at once, so we can access it quickly later on.
            long[] pageValueIds = inputPage.getValues().decompressedArray();
            Long[] colValueIdsByRow = inputPage.getColumnPageDict()
                .decompressValues(LongStream.of(pageValueIds).boxed().toArray(l -> new Long[l]));

            Set<Long> sortedNotAvailableIndices;
            String interestingPrefix = rowIdsNotAvailableForInputCols.floorKey(inputColName);
            if (interestingPrefix != null && inputColName.startsWith(interestingPrefix)) {
              sortedNotAvailableIndices = rowIdsNotAvailableForInputCols.get(interestingPrefix)
                  .subSet(inputPage.getFirstRowId(), inputPage.getFirstRowId() + inputPage.getValues().size());
            } else
              sortedNotAvailableIndices = new HashSet<>();

            // peek next unavailable index; works because indices are sorted.
            PeekingIterator<Long> notAvailableIndicesIt =
                Iterators.peekingIterator(sortedNotAvailableIndices.iterator());

            for (int i = 0; i < inputPage.getValues().size(); i++) {
              if (notAvailableIndicesIt.hasNext()
                  && notAvailableIndicesIt.peek() == inputPage.getFirstRowId() + i) {
                notAvailableIndicesIt.next();
                continue;
              }

              if (nextPageValueNextIdx == nextPageValues.length) {
                flattenedColPages.add(buildColPageFromValueArray(nextPageValues, -1, nextFirstRowId, newColName));
                nextPageValueNextIdx = 0;
                nextFirstRowId += nextPageValues.length;
              }
              long origColValueId = colValueIdsByRow[i];
              nextPageValues[nextPageValueNextIdx++] =
                  (columnValueIdChangeMap != null) ? columnValueIdChangeMap.get(origColValueId) : origColValueId;
            }
          }
        }
      }

      if (nextPageValueNextIdx > 0) {
        flattenedColPages
            .add(buildColPageFromValueArray(nextPageValues, nextPageValueNextIdx, nextFirstRowId, newColName));
        nextFirstRowId += nextPageValueNextIdx;
        nextPageValueNextIdx = 0;
      }

      NavigableMap<Long, ColumnPage> navigableFlattenedColPages = new TreeMap<>();
      for (ColumnPage flattenedColPage : flattenedColPages)
        navigableFlattenedColPages.put(flattenedColPage.getFirstRowId(), flattenedColPage);

      StandardColumnShard flattenedColShard = null;
      switch (colType) {
      case STRING:
        flattenedColShard = columnShardFactory.createStandardStringColumnShard(newColName, navigableFlattenedColPages,
            (StringDictionary<?>) colDict);
        break;
      case LONG:
        flattenedColShard = columnShardFactory.createStandardLongColumnShard(newColName, navigableFlattenedColPages,
            (LongDictionary<?>) colDict);
        break;
      case DOUBLE:
        flattenedColShard = columnShardFactory.createStandardDoubleColumnShard(newColName, navigableFlattenedColPages,
            (DoubleDictionary<?>) colDict);
        break;
      }

      flattenedColShards.add(flattenedColShard);
      logger.trace("Created flattened column {}", newColName);
    }

    TableShard flattenedTableShard = tableFactory.createDefaultTableShard(resultTableName, flattenedColShards);
    logger.trace("Created flattened table shard {}", resultTableName);
    return flattenedTableShard;
  }

  /**
   * Merges multiple col dicts into one.
   *
   * <p>
   * The input dictionaries are expected to be of type T. T must be {@link Comparable} (which is no problem for our
   * value types String, Long and Double).
   *
   * @param inputDicts
   *          The col dicts of the input cols, indexed by an artificial "dictionary id" which can be chosen
   *          arbitrarily.
   * @return Pair of the merged dictionary and, for each input dict ID, a mapping map. That map maps from the old col
   *         dict ID of a value to the new col dict ID in the merged dict. The map can be empty.
   */
  @SuppressWarnings("unchecked")
  private <T extends Comparable<T>> Pair<Dictionary<?>, Map<Long, Map<Long, Long>>> mergeDicts(String colName,
      ColumnType colType, Map<Long, Dictionary<?>> inputDicts) throws IllegalStateException {
    Map<Long, Map<Long, Long>> resMappingMap = new HashMap<>();
    if (inputDicts.size() == 1) {
      return new Pair<>(inputDicts.values().iterator().next(), resMappingMap);
    }

    Map<Long, PeekingIterator<Pair<Long, T>>> iterators = new HashMap<>();
    for (Entry<Long, Dictionary<?>> e : inputDicts.entrySet()) {
      if (e.getValue().getMaxId() == null)
        continue;
      iterators.put(e.getKey(), Iterators.peekingIterator(((Dictionary<T>) e.getValue()).iterator()));
    }

    // order the next elements of all dicts by their value.
    // Pair of (Pair of ID in dict and value) and dictId
    PriorityQueue<Pair<Pair<Long, T>, Long>> nextElements =
        new PriorityQueue<>((p1, p2) -> p1.getLeft().getRight().compareTo(p2.getLeft().getRight()));
    for (Entry<Long, PeekingIterator<Pair<Long, T>>> e : iterators.entrySet())
      nextElements.add(new Pair<>(e.getValue().peek(), e.getKey()));

    // map from value to new ID which will be fed into the dictionary builder.
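    // Sketch of the merge (illustrative values): dict 0 = { 0 -> "a", 1 -> "c" } and dict 2 = { 0 -> "b", 1 -> "c" }
    // merge into entityMap { "a" -> 0, "b" -> 1, "c" -> 2 }, with mapping maps 0 -> { 0 -> 0, 1 -> 2 } and
    // 2 -> { 0 -> 1, 1 -> 2 }: a classic k-way merge over the sorted dictionary iterators using the priority queue.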
    NavigableMap<T, Long> entityMap = new TreeMap<>();
    long nextEntityId = 0L;

    Pair<T, Long> previous = null;
    // traverse all dictionaries and build mapping list
    while (!nextElements.isEmpty()) {
      Pair<Pair<Long, T>, Long> p = nextElements.poll();
      Long dictId = p.getRight();
      Pair<Long, T> valuePair = p.getLeft();

      // move iterator forward
      iterators.get(dictId).next();
      if (iterators.get(dictId).hasNext())
        nextElements.add(new Pair<>(iterators.get(dictId).peek(), dictId));

      long idInInputDict = valuePair.getLeft();
      if (previous == null || valuePair.getRight().compareTo(previous.getLeft()) > 0) {
        long resultNewId = nextEntityId++;
        entityMap.put(valuePair.getRight(), resultNewId);
        previous = new Pair<>(valuePair.getRight(), resultNewId);
      }

      if (!resMappingMap.containsKey(dictId))
        resMappingMap.put(dictId, new HashMap<>());
      resMappingMap.get(dictId).put(idInInputDict, previous.getRight());
    }

    Dictionary<?> resDict = null;
    Map<Long, Long> builderAdjustMap = null;
    switch (colType) {
    case LONG:
      CompressedLongDictionaryBuilder longBuilder = new CompressedLongDictionaryBuilder();
      longBuilder.withDictionaryName(colName).fromEntityMap((NavigableMap<Long, Long>) entityMap);
      Pair<LongDictionary<?>, Map<Long, Long>> longPair = longBuilder.build();
      builderAdjustMap = longPair.getRight();
      resDict = longPair.getLeft();
      break;
    case STRING:
      CompressedStringDictionaryBuilder stringBuilder = new CompressedStringDictionaryBuilder();
      stringBuilder.fromEntityMap((NavigableMap<String, Long>) entityMap);
      Pair<StringDictionary<?>, Map<Long, Long>> stringPair = stringBuilder.build();
      builderAdjustMap = stringPair.getRight();
      resDict = stringPair.getLeft();
      break;
    case DOUBLE:
      CompressedDoubleDictionaryBuilder doubleBuilder = new CompressedDoubleDictionaryBuilder();
      doubleBuilder.fromEntityMap((NavigableMap<Double, Long>) entityMap);
      Pair<DoubleDictionary<?>, Map<Long, Long>> doublePair = doubleBuilder.build();
      builderAdjustMap = doublePair.getRight();
      resDict = doublePair.getLeft();
      break;
    }

    if (!builderAdjustMap.isEmpty())
      throw new IllegalStateException(
          "IDs of new col dict for col " + colName + " were adjusted although that was not expected!");

    return new Pair<Dictionary<?>, Map<Long, Map<Long, Long>>>(resDict, resMappingMap);
  }

  /**
   * Creates a new dictionary of the correct type, which will have a single entry at ID 0: the default value for the
   * given type.
   */
  private Dictionary<?> createDictionaryWithOnlyDefaultValue(ColumnType colType) {
    switch (colType) {
    case STRING:
      return new ConstantStringDictionary(LoaderColumnInfo.DEFAULT_STRING);
    case LONG:
      return new ConstantLongDictionary(LoaderColumnInfo.DEFAULT_LONG);
    case DOUBLE:
      return new ConstantDoubleDictionary(LoaderColumnInfo.DEFAULT_DOUBLE);
    }
    return null; // never happens
  }

  /**
   * Builds a new {@link ColumnPage} from a simple values array.
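   *
   * <p>
   * For illustration (assumed values): colPageValues = [5, 5, 7] yields valueMap { 5 -> 0, 7 -> 1 }, and the page
   * then stores the temp IDs [0, 0, 1], which the {@link ColumnPageBuilder} compresses.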
   *
   * @param colPageValues
   *          Contains the actual value the colPage should have for each row. This long array might be changed by this
   *          method and its values are not valid any more upon return of this method.
   * @param colPageValuesLength
   *          The number of entries in the colPageValues array that should actually be used. Pass -1 to use the whole
   *          colPageValues array.
   * @param firstRowId
   *          First row ID of the resulting {@link ColumnPage}.
   * @param colName
   *          The name of the column that the new col page will be part of.
   * @return The new {@link ColumnPage}.
   */
  private ColumnPage buildColPageFromValueArray(long[] colPageValues, int colPageValuesLength, long firstRowId,
      String colName) {
    if (colPageValuesLength != -1) {
      long[] newColPageValues = new long[colPageValuesLength];
      System.arraycopy(colPageValues, 0, newColPageValues, 0, colPageValuesLength);
      colPageValues = newColPageValues;
    }

    // create the needed "valueMap" from actual value to a temp ID and replace colPageValues with those temp IDs.
    NavigableMap<Long, Long> valueMap = new TreeMap<>();
    long nextFreeTempId = 0L;
    for (int i = 0; i < colPageValues.length; i++) {
      if (!valueMap.containsKey(colPageValues[i]))
        valueMap.put(colPageValues[i], nextFreeTempId++);
      colPageValues[i] = valueMap.get(colPageValues[i]);
    }

    ColumnPageBuilder builder = new ColumnPageBuilder(columnPageFactory);
    builder.withColumnPageName(colName + "#" + firstRowId).withFirstRowId(firstRowId).withValueMap(valueMap)
        .withValues(colPageValues); // use same array here, builder will change this array again.

    return builder.build();
  }
}