Source code

Java tutorial


Here is the source code for CubertInputFormat.java


/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

import com.linkedin.cubert.plan.physical.CubertStrings;

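/**
 * A generic InputFormat that transparently handles multi mappers as well as combined
 * file splits.
 *
 * @author Maneesh Varshney
 * @param <K> the key type produced by the wrapped input format
 * @param <V> the value type produced by the wrapped input format
 */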
public class CubertInputFormat<K, V> extends InputFormat<K, V> {

    private InputFormat<K, V> getActualInputFormat(JobContext context) {
        try {
            InputFormat<K, V> actualInputFormat = (InputFormat<K, V>) context.getInputFormatClass().newInstance();
            if (actualInputFormat instanceof CubertInputFormat)
                throw new RuntimeException("No actual input format specified");

            return actualInputFormat;
        } catch (InstantiationException e) {
            // TODO Auto-generated catch block
        } catch (IllegalAccessException e) {
            // TODO Auto-generated catch block
        } catch (ClassNotFoundException e) {
            // TODO Auto-generated catch block
        return null;

    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        ConfigurationDiff confDiff = new ConfigurationDiff(conf);

        int numMultiMappers = confDiff.getNumDiffs();

        List<InputSplit> splits = new ArrayList<InputSplit>();

        for (int mapperIndex = 0; mapperIndex < numMultiMappers; mapperIndex++) {
            // reset conf to multimapper i

            // get the actual input format
            InputFormat<K, V> actualInputFormat = getActualInputFormat(context);

            List<InputSplit> actualSplits = null;

            // check if combined input split is requested
            boolean combineSplit = conf.getBoolean(CubertStrings.COMBINED_INPUT, false);

            if (combineSplit) {
                // Create CombinedFileInputFormat
                CombineFileInputFormat<K, V> cfif = new CombineFileInputFormat<K, V>() {
                    public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
                            throws IOException {
                        throw new IllegalStateException("Should not be called");

                // get the splits
                actualSplits = cfif.getSplits(context);
            } else {
                actualSplits = actualInputFormat.getSplits(context);

            // embed each split in MultiMapperSplit and add to list
            for (InputSplit actualSplit : actualSplits)
                splits.add(new MultiMapperSplit(actualSplit, mapperIndex));

            // undo the diff
        return splits;

    public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        ConfigurationDiff confDiff = new ConfigurationDiff(conf);

        MultiMapperSplit mmSplit = (MultiMapperSplit) split;
        int multiMapperIndex = mmSplit.getMultiMapperIndex();


        // reset the conf to multiMapperIndex
        InputSplit actualSplit = mmSplit.getActualSplit();

        // get the actual input format class
        InputFormat<K, V> actualInputFormat = getActualInputFormat(context);

        RecordReader<K, V> reader = null;

        if (actualSplit instanceof CombineFileSplit) {
            reader = new CombinedFileRecordReader<K, V>(actualInputFormat, (CombineFileSplit) actualSplit, context);
        } else {
            reader = actualInputFormat.createRecordReader(actualSplit, context);

        // confDiff.undoDiff(multiMapperIndex);

        return new MultiMapperRecordReader<K, V>(reader);
