com.linkedin.cubert.block.CreateBlockOperator.java Source code

Introduction

Here is the source code for com.linkedin.cubert.block.CreateBlockOperator.java
Source

/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.block;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.operator.BlockOperator;
import com.linkedin.cubert.operator.PhaseContext;
import com.linkedin.cubert.operator.PostCondition;
import com.linkedin.cubert.operator.PreconditionException;
import com.linkedin.cubert.utils.CommonUtils;
import com.linkedin.cubert.utils.DefaultSortAlgo;
import com.linkedin.cubert.utils.FileCache;
import com.linkedin.cubert.utils.JsonUtils;
import com.linkedin.cubert.utils.SerializedTupleStore;
import com.linkedin.cubert.utils.TupleUtils;

/**
 * A BlockOperator that creates blocks.
 * 
 * @author Maneesh Varshney
 * 
 */
public class CreateBlockOperator implements BlockOperator {

    /**
     * Decides whether the block currently being assembled has reached its limit.
     * How the limit ({@code blockgenValue}) is interpreted depends on the blockgen
     * type supplied at construction.
     */
    private static class CostFunction {
        protected final long blockgenValue;
        protected final BlockgenType type;

        CostFunction(long blockgenValue, BlockgenType type) {
            this.blockgenValue = blockgenValue;
            this.type = type;
        }

        /**
         * Tests the current block statistics against the configured limit.
         *
         * @param numTuples
         *            number of tuples placed in the block so far
         * @param numPartitionKeys
         *            number of partition keys consumed for the block so far
         * @param storeSize
         *            size reported by the tuple store backing the block
         * @return true if the block should be closed
         */
        protected boolean isCostExceeded(long numTuples, long numPartitionKeys, long storeSize) {
            switch (type) {
            case BY_BLOCK_ID:
            case BY_INDEX:
                // any consumed partition key closes the block
                return numPartitionKeys > 0;
            case BY_PARTITION_KEY:
                return numPartitionKeys >= blockgenValue;
            case BY_ROW:
                return numTuples >= blockgenValue;
            case BY_SIZE:
                return storeSize >= blockgenValue;
            default:
                // USER_DEFINED (and anything else) never reports an exceeded cost
                return false;
            }
        }
    }

    /**
     * A block whose tuples are fully read into a {@link SerializedTupleStore} at
     * construction time, sorted if sort keys are given, and then served from the
     * store.
     */
    static final class StoredBlock implements Block {
        private final PivotedBlock input;
        private Iterator<Tuple> iterator;
        private int numPartitionKeys;
        private final SerializedTupleStore store;
        private final BlockProperties props;

        /**
         * Reads tuples from {@code input}, one pivot at a time, until the cost
         * function reports the block is full or the input cannot advance further.
         *
         * @param input
         *            pivoted source of tuples
         * @param partitionKeys
         *            columns extracted from the first tuple as the block's partition key
         * @param sortKeys
         *            columns to sort the stored tuples on; null means no sorting
         * @param costFunction
         *            decides when the block is full
         */
        StoredBlock(PivotedBlock input, String[] partitionKeys, String[] sortKeys, CostFunction costFunction)
                throws IOException, InterruptedException {
            this.input = input;
            final BlockSchema schema = input.getProperties().getSchema();
            props = new BlockProperties("", schema, input.getProperties());
            props.setBlockId(-1);

            // Without sort keys the store is created from the schema alone (the
            // simpler variant that does not maintain an offset list).
            store = (sortKeys == null)
                    ? new SerializedTupleStore(schema)
                    : new SerializedTupleStore(schema, sortKeys);

            // The first tuple supplies the partition key of this block.
            final Tuple head = input.next();
            if (head == null) {
                // No input data at all: serve nothing.
                iterator = emptyIterator();
                return;
            }

            props.setPartitionKey(TupleUtils.extractTuple(head, schema, partitionKeys));
            store.addToStore(head);

            boolean moreKeys;
            do {
                // drain the current pivot into the store
                Tuple next;
                while ((next = input.next()) != null) {
                    store.addToStore(next);
                }
                numPartitionKeys++;

                // advancePivot() is only attempted while the cost limit is not reached
                moreKeys = !costFunction.isCostExceeded(store.getNumTuples(), numPartitionKeys, store.size())
                        && input.advancePivot();
            } while (moreKeys);

            System.out.println("Store size is " + store.size());

            if (sortKeys != null) {
                store.sort(new DefaultSortAlgo<Tuple>());
            }

            iterator = store.iterator();
        }

        /** Builds an iterator that never has elements; next() yields null and remove() does nothing. */
        private static Iterator<Tuple> emptyIterator() {
            return new Iterator<Tuple>() {
                @Override
                public boolean hasNext() {
                    return false;
                }

                @Override
                public Tuple next() {
                    return null;
                }

                @Override
                public void remove() {
                }
            };
        }

        @Override
        public void configure(JsonNode json) throws IOException, InterruptedException {
            // nothing to configure
        }

        /**
         * @return the next stored tuple, or null once the store is exhausted (the
         *         store is cleared at that point)
         */
        @Override
        public Tuple next() throws IOException, InterruptedException {
            if (!iterator.hasNext()) {
                store.clear();
                return null;
            }
            return iterator.next();
        }

        @Override
        public void rewind() throws IOException {
            // rewinding is a no-op for this block
        }

        @Override
        public BlockProperties getProperties() {
            return props;
        }

    }

    /**
     * A block that hands out tuples directly from the pivoted input, without
     * buffering them. It keeps consuming successive pivots of the input until the
     * cost function reports the block is full or the input cannot advance.
     */
    static class StreamingBlock implements Block {
        protected BlockProperties props;
        protected final PivotedBlock input;
        private long numPartitionKeys;
        private long numTuples;
        // first tuple of the block, pre-fetched by the constructor; null if the input had no data
        protected Tuple firstTuple;
        private boolean isFirstTuple = true;
        private final CostFunction costFunction;

        /**
         * Pre-fetches the first tuple and, if present, uses it to set the block's
         * partition key.
         *
         * @param input
         *            pivoted source of tuples
         * @param partitionKeys
         *            columns extracted from the first tuple as the partition key
         * @param costFunction
         *            decides when the block is full
         */
        StreamingBlock(PivotedBlock input, String[] partitionKeys, CostFunction costFunction)
                throws IOException, InterruptedException {
            this.input = input;
            this.costFunction = costFunction;

            BlockSchema inputSchema = input.getProperties().getSchema();
            props = new BlockProperties("", inputSchema, input.getProperties());
            props.setBlockId(-1);

            firstTuple = input.next();
            if (firstTuple == null) {
                return;
            }
            Tuple partitionKey = TupleUtils.extractTuple(firstTuple, inputSchema, partitionKeys);
            props.setPartitionKey(partitionKey);
            numPartitionKeys++;
        }

        @Override
        public void configure(JsonNode json) throws IOException {
            // nothing to configure
        }

        /**
         * Returns the next tuple of this block.
         *
         * @return the next tuple, or null when the block is complete (cost limit
         *         reached, or the input has no further pivot)
         */
        @Override
        public Tuple next() throws IOException, InterruptedException {
            if (isFirstTuple) {
                isFirstTuple = false;

                if (firstTuple == null) {
                    return null;
                }

                numTuples++;
                return firstTuple;
            }

            // Iterate instead of calling next() recursively: an unqualified next()
            // call is dispatched virtually, so in a subclass that overrides next()
            // (e.g. ByIndexBlock, which projects the tuple) the override's
            // post-processing would be applied twice. A loop also keeps the stack
            // depth constant when several consecutive pivots yield no tuples.
            while (true) {
                Tuple tuple = input.next();
                if (tuple != null) {
                    numTuples++;
                    return tuple;
                }

                // current pivot is exhausted: stop if the block is full ...
                if (costFunction.isCostExceeded(numTuples, numPartitionKeys, 0)) {
                    return null;
                }

                // ... or if there is no further pivot to read from
                if (!input.advancePivot()) {
                    return null;
                }

                numPartitionKeys++;
            }
        }

        @Override
        public void rewind() throws IOException {
            // rewinding is a no-op for this block
        }

        @Override
        public BlockProperties getProperties() {
            return props;
        }

    }

    /**
     * A streaming block whose output drops the "BLOCK_ID" column of the input, and
     * whose block id and partition key are taken from a cached index.
     */
    private static class ByIndexBlock extends StreamingBlock {
        private Index index;
        private int[] colIndex;
        private Tuple outputTuple;

        ByIndexBlock(PivotedBlock input, String[] partitionKeys, CostFunction costFunction)
                throws IOException, InterruptedException {
            super(input, partitionKeys, costFunction);

            final BlockSchema sourceSchema = input.getProperties().getSchema();
            final BlockSchema projectedSchema = sourceSchema.getComplementSubset(new String[] { "BLOCK_ID" });

            // for every output column, remember its position in the input tuple
            final int width = projectedSchema.getNumColumns();
            colIndex = new int[width];
            for (int col = 0; col < width; col++) {
                colIndex[col] = sourceSchema.getIndex(projectedSchema.getName(col));
            }

            // a single tuple instance is reused for every row returned by next()
            outputTuple = TupleFactory.getInstance().newTuple(width);

            // replace the properties created by the superclass with ones on the output schema
            props = new BlockProperties("", projectedSchema, input.getProperties());
            props.setBlockId(-1);
        }

        /**
         * Loads the index named by the "index" entry of {@code json} and, when the
         * block has data, configures the block from the BLOCK_ID of its first tuple.
         */
        @Override
        public void configure(JsonNode json) throws IOException {
            final String indexName = JsonUtils.getText(json, "index");
            try {
                index = FileCache.getCachedIndex(indexName);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException(e);
            }

            if (firstTuple == null) {
                return;
            }

            final int blockIdColumn = input.getProperties().getSchema().getIndex("BLOCK_ID");
            configure((Long) firstTuple.get(blockIdColumn));
        }

        /**
         * @return the shared output tuple filled with the projected columns of the
         *         next input tuple, or null when the block is exhausted
         */
        @Override
        public Tuple next() throws IOException, InterruptedException {
            final Tuple raw = super.next();
            if (raw != null) {
                for (int col = 0; col < colIndex.length; col++) {
                    outputTuple.set(col, raw.get(colIndex[col]));
                }
                return outputTuple;
            }
            return null;
        }

        /**
         * Sets the block id and the matching partition key (looked up in the index).
         * Needed when empty blocks have to be created to mirror their counterparts
         * in the parent relation.
         * 
         * @param blockId
         *            block ID to be set
         */
        public void configure(long blockId) {
            props.setPartitionKey(index.getEntry(blockId).getKey());
            props.setBlockId(blockId);
        }

    }

    // operator JSON received in setInput(); handed to every created block's configure()
    private JsonNode inputJson;
    // the single input block, wrapped so that it pivots on partitionKeys
    private PivotedBlock inputBlock;
    // parsed from the "blockgenType" JSON entry
    private BlockgenType blockgenType;
    // read from the "blockgenValue" JSON entry when present; stays 0 otherwise
    private long blockgenValue;
    // read from the "partitionKeys" JSON entry
    private String[] partitionKeys;
    // read from the "pivotKeys" JSON entry; reset to null when sorting is not needed
    private String[] sortKeys;

    // BY_INDEX only: block ids from the index that belong to this reducer; ids are
    // removed as their blocks are returned from next()
    private final Set<Long> relevantBlockIds = new HashSet<Long>();
    // single-field tuple reused by isRelevant() to hash a block id
    private Tuple blockIdTuple = TupleFactory.getInstance().newTuple(1);
    // BY_INDEX only: id of the current reduce task and the number of reducers (-1 until set)
    private int reducerId = -1;
    private int nReducers = -1;
    // true until next() has returned its first block
    boolean firstBlock = true;

    /**
     * Reads the operator configuration and wraps the single input block in a
     * {@link PivotedBlock} keyed on the partition keys.
     *
     * @param conf
     *            job configuration; consulted for the reducer count in the BY_INDEX case
     * @param input
     *            must contain exactly one non-null block
     * @param json
     *            operator JSON ("blockgenType", optional "blockgenValue",
     *            "partitionKeys", optional "pivotKeys", and "index" for BY_INDEX)
     * @throws IllegalArgumentException
     *             if {@code input} does not hold exactly one non-null block, or the
     *             blockgen type is not a known {@code BlockgenType}
     * @throws RuntimeException
     *             in the BY_INDEX case, when not running in a reducer or when the
     *             number of reducers cannot be determined
     */
    @Override
    public void setInput(Configuration conf, Map<String, Block> input, JsonNode json)
            throws IOException, InterruptedException {
        if (input.size() == 0)
            throw new IllegalArgumentException("No input block is provided");

        if (input.size() != 1)
            throw new IllegalArgumentException(
                    "This operator operates on only one input block. (" + input.size() + " provided)");

        this.inputJson = json;
        Block block = input.values().iterator().next();

        if (block == null)
            throw new IllegalArgumentException(
                    "The specified block for [" + input.keySet().iterator().next() + "] is null");

        // Locale.ROOT keeps the upper-casing independent of the JVM default locale
        // (e.g. under a Turkish locale "by_index" would not map to "BY_INDEX").
        blockgenType = BlockgenType
                .valueOf(JsonUtils.getText(json, "blockgenType").toUpperCase(Locale.ROOT));
        if (json.has("blockgenValue")) {
            blockgenValue = json.get("blockgenValue").getLongValue();
        }
        partitionKeys = JsonUtils.asArray(json.get("partitionKeys"));

        if (json.has("pivotKeys"))
            sortKeys = JsonUtils.asArray(json, "pivotKeys");

        // if sort keys are prefix of partitionkeys. or vice versa, there is no need to
        // sort the block
        if (CommonUtils.isPrefix(partitionKeys, sortKeys) || CommonUtils.isPrefix(sortKeys, partitionKeys))
            sortKeys = null;

        inputBlock = new PivotedBlock(block, partitionKeys);

        if (blockgenType == BlockgenType.BY_INDEX) {
            if (PhaseContext.isMapper()) {
                throw new RuntimeException("Expecting Reduce Context while performing LOAD BLOCK");
            }
            nReducers = conf.getInt("mapred.reduce.tasks", -1);
            // zero is rejected as well: isRelevant() divides by nReducers
            if (nReducers <= 0)
                throw new RuntimeException("Unable to determine number of reducers.");
            reducerId = PhaseContext.getRedContext().getTaskAttemptID().getTaskID().getId();
            retrieveRelevantBlockIds(json);
        }
    }

    /**
     * Loads the index named by the "index" entry of {@code json} and records every
     * block id in it that {@link #isRelevant(Long)} accepts for this reducer.
     */
    private void retrieveRelevantBlockIds(JsonNode json) throws IOException {
        Index blockIndex;
        try {
            blockIndex = FileCache.getCachedIndex(JsonUtils.getText(json, "index"));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }

        for (Long candidate : blockIndex.getAllBlockIds()) {
            if (!isRelevant(candidate)) {
                continue;
            }
            relevantBlockIds.add(candidate);
        }
    }

    /**
     * The relevance check is based on the fact that INDEX based join happens by shuffling
     * data on the BLOCK_ID.
     * 
     * @param blockId
     *            blockId
     * @return if the block is relevant to the current reducer
     * @throws RuntimeException
     *             if the block id cannot be stored in the reusable tuple
     */
    private boolean isRelevant(Long blockId) {
        try {
            /* Reusing the blockIdTuple object */
            blockIdTuple.set(0, blockId);
        } catch (ExecException e) {
            // Continuing would hash whatever value the tuple held before, giving a
            // silently wrong answer; fail loudly instead.
            throw new RuntimeException("Unable to store block id " + blockId + " in tuple", e);
        }

        /* Is this reducer supposed to see this blockId */
        // NOTE(review): if getBlockId can return a negative value, the remainder is
        // negative and matches no reducer id - confirm against the shuffle partitioner.
        final int hashCode = BlockUtils.getBlockId(blockIdTuple);
        return hashCode % nReducers == reducerId;
    }

    /**
     * Produces the next block.
     *
     * @return the next block; or null once the input cannot advance and, in the
     *         BY_INDEX case, no relevant block ids are left to emit as empty blocks
     */
    @Override
    public Block next() throws IOException, InterruptedException {
        // advancePivot() is only attempted after the first block has been handed out
        final boolean inputExhausted = !firstBlock && !inputBlock.advancePivot();

        if (inputExhausted) {
            if (blockgenType != BlockgenType.BY_INDEX || relevantBlockIds.isEmpty()) {
                return null;
            }

            // Blockgen BY_INDEX: emit an empty block for each block id relevant to
            // this reducer that has not been produced yet.
            final Long pendingId = relevantBlockIds.iterator().next();

            final ByIndexBlock emptyBlock = (ByIndexBlock) createBlock();
            emptyBlock.configure(inputJson);
            emptyBlock.configure(pendingId);
            relevantBlockIds.remove(pendingId);

            return emptyBlock;
        }

        final Block block = createBlock();
        block.configure(inputJson);

        // Blockgen BY_INDEX where this reducer received no data at all but still has
        // relevant block ids: the very first block is empty, so configure(JsonNode)
        // could not derive a block id from a first tuple. Assign one of the pending
        // ids explicitly (this sets both the block id and the partition key from
        // the index).
        if (firstBlock && !relevantBlockIds.isEmpty() && block instanceof ByIndexBlock) {
            final ByIndexBlock indexBlock = (ByIndexBlock) block;
            if (indexBlock.firstTuple == null) {
                final Long pendingId = relevantBlockIds.iterator().next();
                indexBlock.configure(pendingId);
            }
        }

        // this block id has now been produced
        relevantBlockIds.remove(block.getProperties().getBlockId());

        firstBlock = false;
        return block;
    }

    /**
     * Instantiates the block implementation matching the configured blockgen type.
     * BY_INDEX and BY_BLOCK_ID use an index-backed block; BY_PARTITION_KEY and
     * BY_ROW stream when no sorting is needed and store otherwise; BY_SIZE always
     * stores.
     *
     * @return a newly constructed block, never null
     * @throws UnsupportedOperationException
     *             if no block implementation exists for the blockgen type (e.g.
     *             USER_DEFINED)
     */
    private Block createBlock() throws IOException, InterruptedException {
        final CostFunction costFunction = new CostFunction(blockgenValue, blockgenType);

        switch (blockgenType) {
        case BY_INDEX:
        case BY_BLOCK_ID:
            return new ByIndexBlock(inputBlock, partitionKeys, costFunction);
        case BY_PARTITION_KEY:
        case BY_ROW:
            if (sortKeys == null) {
                return new StreamingBlock(inputBlock, partitionKeys, costFunction);
            }
            return new StoredBlock(inputBlock, partitionKeys, sortKeys, costFunction);
        case BY_SIZE:
            return new StoredBlock(inputBlock, partitionKeys, sortKeys, costFunction);
        default:
            // Callers dereference the result immediately; fail here with a message
            // naming the type rather than returning null.
            throw new UnsupportedOperationException(
                    "Blockgen type " + blockgenType + " is not supported by CreateBlockOperator");
        }
    }

    /**
     * Not implemented for this operator: always returns null, regardless of the
     * supplied preconditions and JSON.
     */
    @Override
    public PostCondition getPostCondition(Map<String, PostCondition> preConditions, JsonNode json)
            throws PreconditionException {
        // TODO Auto-generated method stub
        return null;
    }
}