/*
 Copyright (c) 2015 by ScaleOut Software, Inc.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/
package com.scaleoutsoftware.soss.hserver;

import com.scaleoutsoftware.soss.client.da.StateServerException;
import com.scaleoutsoftware.soss.hserver.interop.BucketId;
import com.scaleoutsoftware.soss.hserver.interop.BucketStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;

import java.io.IOException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Abstract base class for the input formats that read data stored in the in-memory
 * data grid. The data can be stored in a NamedMap or a NamedCache.
 *
 * @param <K> key type
 * @param <V> value type
 */
abstract class GridInputFormat<K, V> extends InputFormat<K, V> {
    protected static final String inputAppIdProperty = "mapred.hserver.input.appId";
    private static final String inputNumberOfSplitsProperty = "mapred.hserver.input.numsplits";
    private static final int DEFAULT_NUMBER_OF_SPLITS = 5;
    private static final int HSERVER_JOB_DEFAULT_NUMBER_OF_SPLITS = 1024;

    /**
     * Sets the desired number of input splits. If the number of splits is not set through
     * this method, it defaults to the number of available map slots in the cluster.
     *
     * @param job            job to modify
     * @param numberOfSplits desired number of splits
     */
    public static void setSuggestedNumberOfSplits(Job job, int numberOfSplits) {
        if (numberOfSplits < 1) {
            throw new IllegalArgumentException("Number of splits should be greater than 0.");
        }
        job.getConfiguration().setInt(inputNumberOfSplitsProperty, numberOfSplits);
    }
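    // A minimal usage sketch for setSuggestedNumberOfSplits(), assuming a job driver
    // elsewhere in this package (the job name and split count here are illustrative,
    // not part of this file):
    //
    //     Job job = Job.getInstance(new Configuration(), "grid-input-job");
    //     // Hint that the input format should aim for 64 splits rather than
    //     // the slot-based default.
    //     GridInputFormat.setSuggestedNumberOfSplits(job, 64);
    //
    // The value is only a suggestion: getSplits() below rounds it so that every
    // grid host receives at least one split.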
    @Override
    public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
        int appId = jobContext.getConfiguration().getInt(inputAppIdProperty, 0);
        int suggestedNumberOfSplits = getSuggestedNumberOfSplits(jobContext);
        return getSplits(appId, suggestedNumberOfSplits);
    }

    static List<InputSplit> getSplits(int appId, int suggestedNumberOfSplits) throws IOException, InterruptedException {
        if (appId == 0) {
            throw new IOException("Input map or cache is not specified.");
        }

        List<BucketId> bucketIds = BucketStore.bucketizeNamedCache(appId);

        // Group the buckets by the host they reside on
        Map<InetAddress, List<WritableBucketId>> bucketsByHost = new HashMap<InetAddress, List<WritableBucketId>>();
        for (BucketId id : bucketIds) {
            WritableBucketId writableId = WritableBucketId.copy(id);
            InetAddress location;
            try {
                location = BucketStore.getBucketLocation(id);
            } catch (StateServerException e) {
                throw new IOException("Cannot determine bucket location.", e);
            }
            if (!bucketsByHost.containsKey(location)) {
                bucketsByHost.put(location, new ArrayList<WritableBucketId>());
            }
            bucketsByHost.get(location).add(writableId);
        }

        int numberOfHosts = bucketsByHost.size();
        List<InputSplit> splits = new ArrayList<InputSplit>(suggestedNumberOfSplits);

        // We want at least one split per location to achieve reasonable parallelism.
        // Cast to float before dividing so the ratio is rounded instead of being
        // truncated by integer division.
        int numberOfSplitsPerLocation = Math.max(Math.round((float) suggestedNumberOfSplits / (float) numberOfHosts), 1);

        // Go through the hosts, distributing each host's buckets into several splits
        for (InetAddress location : bucketsByHost.keySet()) {
            List<WritableBucketId> bucketIdsForThisHost = bucketsByHost.get(location);
            int numberOfBucketsForThisHost = bucketIdsForThisHost.size();
            int bucketsPerSplit = Math.max(Math.round((float) numberOfBucketsForThisHost / (float) numberOfSplitsPerLocation), 1);

            List<WritableBucketId> bucketIdsForSplit = new ArrayList<WritableBucketId>(bucketsPerSplit);
            int bucketsInTheSplitCounter = 0;
            for (WritableBucketId id : bucketIdsForThisHost) {
                bucketIdsForSplit.add(id);
                bucketsInTheSplitCounter++;
                if (bucketsInTheSplitCounter == bucketsPerSplit) {
                    // Add the completed split and start a new one
                    splits.add(new BucketSplit(bucketIdsForSplit, location));
                    bucketsInTheSplitCounter = 0;
                    bucketIdsForSplit = new ArrayList<WritableBucketId>(bucketsPerSplit);
                }
            }
            if (!bucketIdsForSplit.isEmpty()) {
                // Add the remaining buckets
                splits.add(new BucketSplit(bucketIdsForSplit, location));
            }
        }

        // Sanity check: the splits must account for every bucket exactly once
        int totalNumberOfBucketsInSplits = 0;
        for (InputSplit split : splits) {
            totalNumberOfBucketsInSplits += ((BucketSplit) split).getBucketIds().size();
        }
        if (totalNumberOfBucketsInSplits != bucketIds.size()) {
            throw new RuntimeException("Error while calculating splits. Splits contain = " +
                    totalNumberOfBucketsInSplits + ", total buckets = " + bucketIds.size());
        }
        return splits;
    }
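    // Worked example of the split math above (the numbers are illustrative): with
    // suggestedNumberOfSplits = 20 and 4 hosts, numberOfSplitsPerLocation = 5.
    // A host holding 23 buckets then gets bucketsPerSplit = round(23 / 5) = 5,
    // yielding four splits of 5 buckets plus one remainder split of 3 buckets,
    // all pinned to that host's address for data locality.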
    /**
     * Gets the number of input splits. First tries the corresponding property,
     * then falls back to the number of available map slots in the cluster.
     *
     * @param context job context
     * @return number of input splits
     */
    private int getSuggestedNumberOfSplits(JobContext context) throws IOException {
        Configuration conf = context.getConfiguration();
        int numberOfSplits = conf.getInt(inputNumberOfSplitsProperty, -1);
        if (numberOfSplits > 0) return numberOfSplits;
        if (HServerParameters.isHServerJob(conf)) {
            // We are running an hServer job, not a Hadoop job
            return HSERVER_JOB_DEFAULT_NUMBER_OF_SPLITS;
        }
        try {
            // Wrap the configuration in a JobConf rather than casting it: the
            // configuration object is not necessarily a JobConf instance, and a
            // failed cast would needlessly skip the cluster query.
            ClusterStatus status = new JobClient(new JobConf(conf)).getClusterStatus();
            numberOfSplits = status.getMaxMapTasks() - status.getMapTasks();
            if (numberOfSplits > 0) return numberOfSplits;
        } catch (Throwable t) {
            // Do nothing; fall through to the default
        }
        return DEFAULT_NUMBER_OF_SPLITS;
    }
}
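// A minimal sketch of a concrete subclass, assuming a hypothetical record reader
// over grid buckets (NamedCacheInputFormat and BucketRecordReader are illustrative
// names, not part of this file). GridInputFormat already supplies getSplits(), so
// only createRecordReader() remains to be implemented from InputFormat:
//
//     class NamedCacheInputFormat extends GridInputFormat<Text, Text> {
//         @Override
//         public RecordReader<Text, Text> createRecordReader(InputSplit split,
//                                                            TaskAttemptContext context)
//                 throws IOException, InterruptedException {
//             // Iterate over the buckets assigned to this split
//             return new BucketRecordReader(((BucketSplit) split).getBucketIds());
//         }
//     }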