com.willetinc.hadoop.mapreduce.dynamodb.AbstractSplitter.java Source code

Java tutorial

Introduction

Here is the source code for com.willetinc.hadoop.mapreduce.dynamodb.AbstractSplitter.java

Source

/**
 * Copyright 2012 Willet Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.willetinc.hadoop.mapreduce.dynamodb;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;

import com.amazonaws.services.dynamodb.model.AttributeValue;
import com.amazonaws.services.dynamodb.model.ComparisonOperator;

/**
 * <p>
 * Base implementation of {@link DynamoDBSplitter} that turns the query
 * settings stored in a job {@link Configuration} into a list of
 * {@link InputSplit}s.
 * </p>
 * <p>
 * When range key interpolation is disabled, or cannot be applied, a single
 * split is produced. Otherwise dividing the span between the configured
 * minimum and maximum range key values is delegated to
 * {@code generateRangeKeySplits}, which each concrete subclass implements.
 * </p>
 */
public abstract class AbstractSplitter implements DynamoDBSplitter {

    private static final Log LOG = LogFactory.getLog(AbstractSplitter.class);

    /**
     * Builds the input splits for a DynamoDB query job.
     * <p>
     * The requested number of splits is read from {@code mapred.map.tasks}
     * (defaulting to 1). Interpolation is only attempted when it is enabled,
     * more than one split was requested, both the minimum and maximum range
     * key values are configured, and a hash key value is present. In every
     * other case exactly one split is returned.
     * </p>
     * 
     * @param conf
     *            job configuration holding the DynamoDBQueryInputFormat
     *            settings
     * @return the generated splits; never {@code null}
     * @throws IOException
     *             declared by the method signature; not thrown explicitly by
     *             the code in this method
     */
    @Override
    public List<InputSplit> split(Configuration conf) throws IOException {

        // load configuration
        boolean interpolate = DynamoDBQueryInputFormat.getInterpolateAcrossRangeKeyValues(conf);

        Types hashKeyType = DynamoDBQueryInputFormat.getHashKeyType(conf);
        AttributeValue hashKeyValue = DynamoDBQueryInputFormat.getHashKeyValue(conf);

        Types rangeKeyType = DynamoDBQueryInputFormat.getRangeKeyType(conf);
        Collection<AttributeValue> rangeKeyValues = DynamoDBQueryInputFormat.getRangeKeyValues(conf);
        ComparisonOperator rangeKeyoperator = DynamoDBQueryInputFormat.getRangeKeyComparisonOperator(conf);
        AttributeValue minRangeKeyValue = DynamoDBQueryInputFormat.getRangeKeyInterpolateMinValue(conf);
        AttributeValue maxRangeKeyValue = DynamoDBQueryInputFormat.getRangeKeyInterpolateMaxValue(conf);

        // the two modes are mutually exclusive: explicit range key values are
        // discarded when interpolating, and the interpolation bounds are
        // discarded when not
        if (interpolate) {
            rangeKeyValues = new ArrayList<AttributeValue>();
        } else {
            minRangeKeyValue = null;
            maxRangeKeyValue = null;
        }

        // compute number of input splits; only a single hash key is queried,
        // so every requested split is available for the range key
        int numSplits = conf.getInt("mapred.map.tasks", 1);
        int numRangeSplits = interpolate ? Math.max(1, numSplits) : 1;

        // generate input splits
        List<InputSplit> splits = new ArrayList<InputSplit>();

        boolean canInterpolate = interpolate && numRangeSplits > 1 && minRangeKeyValue != null
                && maxRangeKeyValue != null;

        // a range cannot be built without a hash key; fall back to the single
        // split instead of returning no splits at all
        if (canInterpolate && null == hashKeyValue) {
            LOG.error("Cannot create a range when the HashKey is NULL. Ignoring range key interpolation.");
            canInterpolate = false;
        }

        if (!canInterpolate) {
            // handle cases where interpolation is turned off or unnecessary
            LOG.info("Generating 1 split for each HashKey");

            DynamoDBQueryInputFormat.DynamoDBQueryInputSplit split = new DynamoDBQueryInputFormat.DynamoDBQueryInputSplit(
                    hashKeyType, hashKeyValue, rangeKeyType, rangeKeyValues, rangeKeyoperator);

            splits.add(split);
        } else {
            // interpolate between RangeKey values
            LOG.info(String.format("Generating %d RangeKey splits for each HashKey", numRangeSplits));

            generateRangeKeySplits(conf, splits, hashKeyType, hashKeyValue, rangeKeyType, minRangeKeyValue,
                    maxRangeKeyValue, numRangeSplits);
        }

        return splits;
    }

    /**
     * Adds the interpolated range key splits for one hash key to
     * {@code splits}.
     * <p>
     * {@link #split(Configuration)} calls this only when interpolation is
     * enabled, {@code numRangeSplits} is greater than 1, and the hash key
     * value and both range key bounds are non-null. Whatever this method
     * leaves in {@code splits} is returned to the caller of {@code split}.
     * </p>
     * 
     * @param conf
     *            job configuration
     * @param splits
     *            list the generated splits are to be added to
     * @param hashKeyType
     *            type of the hash key
     * @param hashKeyValue
     *            hash key value shared by all generated splits
     * @param rangeKeyType
     *            type of the range key
     * @param minRangeKeyValue
     *            configured minimum range key value to interpolate from
     * @param maxRangeKeyValue
     *            configured maximum range key value to interpolate to
     * @param numRangeSplits
     *            number of range splits requested
     */
    abstract void generateRangeKeySplits(Configuration conf, List<InputSplit> splits, Types hashKeyType,
            AttributeValue hashKeyValue, Types rangeKeyType, AttributeValue minRangeKeyValue,
            AttributeValue maxRangeKeyValue, int numRangeSplits);

}