com.google.appengine.tools.mapreduce.RangeInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for com.google.appengine.tools.mapreduce.RangeInputFormat.java

Source

/*
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.appengine.tools.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * An input format that produces integer keys in a given range.
 *
 * The range is specified to be from the value of
 * {@code RangeInputFormat.RANGE_START_KEY} inclusive to the value of
 * {@code RangeInputFormat.RANGE_END_KEY} exclusive.
 *
 * The number of shards to use is specified by the value of
 * {@code RangeInputFormat.SHARD_COUNT_KEY}
 *
 *
 */
public class RangeInputFormat extends InputFormat<Long, NullWritable> {
    /**
     * Key for storing the shard count in a {@code Configuration}
     */
    public static final String SHARD_COUNT_KEY = "mapreduce.mapper.shardcount";

    /**
     * Default number of input shards.
     */
    public static final int DEFAULT_SHARD_COUNT = 4;

    /**
     * Key for inclusive start value for the range to process.
     */
    public static final String RANGE_START_KEY = "mapreduce.mapper.inputformat.rangeinputformat.range_start";

    /**
     * Key for exclusive end value for the range to process.
     */
    public static final String RANGE_END_KEY = "mapreduce.mapper.inputformat.rangeinputformat.range_end";

    @Override
    public RecordReader<Long, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new RangeRecordReader();
    }

    private long getNonNegativeLong(Configuration conf, String key) throws IOException {
        long retVal = conf.getLong(key, -1L);
        if (retVal < 0) {
            throw new InvalidConfigurationException("Invalid or nonexistent value for " + key);
        }
        return retVal;
    }

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        long shardCount = conf.getInt(SHARD_COUNT_KEY, DEFAULT_SHARD_COUNT);
        long rangeStart = getNonNegativeLong(conf, RANGE_START_KEY);
        long rangeEnd = getNonNegativeLong(conf, RANGE_END_KEY);
        if (rangeStart >= rangeEnd) {
            throw new InvalidConfigurationException("Invalid range. Start: " + rangeStart + " >= end: " + rangeEnd);
        }

        double increment = ((double) rangeEnd - rangeStart) / shardCount;
        ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
        for (int i = 0; i < shardCount - 1; i++) {
            splits.add(new RangeInputSplit(rangeStart + Math.round(i * increment),
                    rangeStart + Math.round((i + 1) * increment)));
        }

        // Make sure that the final split hits end
        splits.add(new RangeInputSplit(rangeStart + Math.round((shardCount - 1) * increment), rangeEnd));

        return splits;
    }
}