eu.stratosphere.nephele.jobmanager.splitassigner.file.FileInputSplitList.java Source code

Java tutorial

Introduction

Here is the source code for eu.stratosphere.nephele.jobmanager.splitassigner.file.FileInputSplitList.java

Source

/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/

package eu.stratosphere.nephele.jobmanager.splitassigner.file;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.stratosphere.core.fs.FileInputSplit;
import eu.stratosphere.nephele.instance.AbstractInstance;

/**
 * The file input split list stores the file input splits for an input vertex that are still expected to be consumed.
 * Besides simply storing the splits, the file input split list also computes the distance between each
 * {@link AbstractInstance} which requests an input split and the split's nearest storage location with respect to the
 * underlying network topology.
 * That way input splits are always given to consuming vertices in a way that data locality is preserved as well as
 * possible.
 * <p>
 * This class is not thread-safe.
 * 
 */
public final class FileInputSplitList {

    /**
     * The logging object which is used to report information and errors.
     */
    private static final Log LOG = LogFactory.getLog(FileInputSplitList.class);

    /**
     * The set containing all the file input splits that still must be consumed.
     */
    private final Set<FileInputSplit> masterSet = new HashSet<FileInputSplit>();

    /**
     * The map caching the specific file input split lists for each {@link AbstractInstance}. A cached queue may contain
     * splits that have already been handed out to another instance; such stale entries are filtered against
     * {@link #masterSet} when they are polled.
     */
    private final Map<AbstractInstance, Queue<QueueElem>> instanceMap = new HashMap<AbstractInstance, Queue<QueueElem>>();

    /**
     * This is an auxiliary class to store the minimum distance between a file input split's storage locations and an
     * {@link AbstractInstance}. Elements are ordered by ascending distance.
     * <p>
     * The class is static because it never accesses the state of the enclosing {@link FileInputSplitList}.
     */
    private static final class QueueElem implements Comparable<QueueElem> {

        /**
         * The file input split the distance applies to.
         */
        final FileInputSplit inputSplit;

        /**
         * The minimum distance between the file input split's storage locations and the instance this object has been
         * created for.
         */
        final int distance;

        /**
         * Creates a new queue element.
         * 
         * @param inputSplit
         *        the file input split to be stored
         * @param distance
         *        the minimum distance between the stored input split's storage locations and the instance this object
         *        has been created for
         */
        private QueueElem(final FileInputSplit inputSplit, final int distance) {
            this.inputSplit = inputSplit;
            this.distance = distance;
        }

        /**
         * Returns the file input split stored within this object.
         * 
         * @return the file input split
         */
        private FileInputSplit getInputSplit() {
            return this.inputSplit;
        }

        /**
         * Orders queue elements by ascending distance.
         * 
         * @param o
         *        the queue element to compare this element with
         * @return a negative value, zero, or a positive value if this element's distance is smaller than, equal to, or
         *         greater than the distance of the given element
         */
        @Override
        public int compareTo(final QueueElem o) {

            // Explicit comparison instead of subtraction: a difference of two ints can overflow and flip the sign
            if (this.distance < o.distance) {
                return -1;
            }

            return (this.distance == o.distance) ? 0 : 1;
        }

    }

    /**
     * Adds the given file input split to the set of file input splits to be consumed. If split lists have already been
     * computed for some instances, the new split is also inserted into each of them so that it can still be handed out
     * to those instances.
     * <p>
     * This method synchronizes on this object.
     * 
     * @param fileInputSplit
     *        the file input split to be added
     */
    synchronized void addSplit(final FileInputSplit fileInputSplit) {

        if (!this.masterSet.add(fileInputSplit)) {
            // The split is already pending, so every cached split list which needs it already contains it
            return;
        }

        // Keep the cached per-instance lists in sync, otherwise instances with an existing list never see the split
        for (final Map.Entry<AbstractInstance, Queue<QueueElem>> entry : this.instanceMap.entrySet()) {
            final int distance = computeMinDistance(entry.getKey(), fileInputSplit);
            entry.getValue().add(new QueueElem(fileInputSplit, distance));
        }
    }

    /**
     * Returns the next file input split to be consumed by the given instance. The returned input split is selected in a
     * way that the distance between the split's storage location and the requesting {@link AbstractInstance} is as
     * short as possible.
     * <p>
     * This method synchronizes on this object.
     * 
     * @param instance
     *        the instance requesting the next file input split
     * @return the next input split to be consumed by the given instance or <code>null</code> if all input splits have
     *         already been consumed.
     */
    synchronized FileInputSplit getNextInputSplit(final AbstractInstance instance) {

        final Queue<QueueElem> instanceSplitList = getInstanceSplitList(instance);

        while (true) {

            final QueueElem candidate = instanceSplitList.poll();
            if (candidate == null) {
                return null;
            }

            // Only hand out the split if it is still pending; otherwise another instance has already consumed it
            if (this.masterSet.remove(candidate.getInputSplit())) {
                if (LOG.isInfoEnabled()) {
                    if (candidate.distance == 0) {
                        LOG.info(instance + " receives local file input split");
                    } else {
                        LOG.info(instance + " receives remote file input split (distance " + candidate.distance
                                + ")");
                    }
                }
                return candidate.getInputSplit();
            }

            // Nothing is pending anymore, so there is no point in draining the remaining stale entries
            if (this.masterSet.isEmpty()) {
                return null;
            }
        }
    }

    /**
     * Returns a list of file input splits specifically ordered for the given {@link AbstractInstance}. When the list is
     * initially created, it contains all the unconsumed file input splits at that point in time, ascendingly ordered by
     * the minimum distance between the input splits' storage locations and the given {@link AbstractInstance}. The
     * created list is cached and returned again on subsequent calls for the same instance.
     * 
     * @param instance
     *        the instance for which the file input split list has been computed
     * @return the list of file input splits ordered specifically for the given instance
     */
    private Queue<QueueElem> getInstanceSplitList(final AbstractInstance instance) {

        Queue<QueueElem> instanceSplitList = this.instanceMap.get(instance);
        if (instanceSplitList == null) {

            // Create and populate instance specific split list
            instanceSplitList = new PriorityQueue<QueueElem>();
            for (final FileInputSplit split : this.masterSet) {
                instanceSplitList.add(new QueueElem(split, computeMinDistance(instance, split)));
            }

            this.instanceMap.put(instance, instanceSplitList);
        }

        return instanceSplitList;
    }

    /**
     * Computes the minimum distance between the given instance and the hosts which store the given file input split.
     * 
     * @param instance
     *        the instance the distance shall be computed for
     * @param split
     *        the file input split whose host names are examined
     * @return the smallest distance reported by the instance for any of the split's host names, or
     *         {@link Integer#MAX_VALUE} if the split provides no host names
     */
    private static int computeMinDistance(final AbstractInstance instance, final FileInputSplit split) {

        final String[] hostNames = split.getHostNames();
        if (hostNames == null) {
            return Integer.MAX_VALUE;
        }

        int minDistance = Integer.MAX_VALUE;
        for (final String hostName : hostNames) {
            final int distance = instance.getDistance(hostName);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Distance between " + instance + " and " + hostName + " is " + distance);
            }
            if (distance < minDistance) {
                minDistance = distance;
            }
        }

        return minDistance;
    }
}