org.apache.flink.runtime.jobmanager.splitassigner.file.FileInputSplitList.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.flink.runtime.jobmanager.splitassigner.file.FileInputSplitList.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.jobmanager.splitassigner.file;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.runtime.instance.Instance;

/**
 * The file input split list stores the file input splits for an input vertex that are still expected to be consumed.
 * Besides simply storing the splits, the file input split list also computes the distance between each
 * {@link org.apache.flink.runtime.instance.Instance} that requests an input split and the split's nearest storage
 * location with respect to the underlying network topology.
 * That way input splits are always given to consuming vertices in a way that data locality is preserved as well as
 * possible.
 * <p>
 * This class is not thread-safe.
 * 
 */
public final class FileInputSplitList {

    /**
     * The logging object which is used to report information and errors.
     */
    private static final Log LOG = LogFactory.getLog(FileInputSplitList.class);

    /**
     * The set containing all the file input splits that still must be consumed. A split is removed from this set at
     * the moment it is handed out by {@link #getNextInputSplit(Instance)}.
     */
    private final Set<FileInputSplit> masterSet = new HashSet<FileInputSplit>();

    /**
     * The map caching the specific file input split lists for each {@link org.apache.flink.runtime.instance.Instance}.
     * An entry is created lazily on the first request of an instance and is never refreshed afterwards.
     */
    private final Map<Instance, Queue<QueueElem>> instanceMap = new HashMap<Instance, Queue<QueueElem>>();

    /**
     * This is an auxiliary class to store the minimum distance between a file input split's storage locations and an
     * {@link org.apache.flink.runtime.instance.Instance}.
     * <p>
     * The class is a static nested class because it does not need access to the enclosing list; this avoids keeping a
     * hidden reference to the enclosing object in every queue element.
     */
    private static final class QueueElem implements Comparable<QueueElem> {

        /**
         * The file input split the distance applies to.
         */
        final FileInputSplit inputSplit;

        /**
         * The minimum distance between the file input split's storage locations and the instance this object has been
         * created for.
         */
        final int distance;

        /**
         * Creates a new queue element.
         * 
         * @param inputSplit
         *        the file input split to be stored
         * @param distance
         *        the minimum distance between the stored input split's storage locations and the instance this object
         *        has been created for
         */
        private QueueElem(final FileInputSplit inputSplit, final int distance) {
            this.inputSplit = inputSplit;
            this.distance = distance;
        }

        /**
         * Returns the file input split stored within this object.
         * 
         * @return the file input split
         */
        private FileInputSplit getInputSplit() {
            return this.inputSplit;
        }

        /**
         * Orders queue elements ascendingly by their distance.
         * 
         * @param o
         *        the queue element to compare this element with
         * @return a negative value, zero, or a positive value if this element's distance is smaller than, equal to, or
         *         larger than the distance of the given element
         */
        @Override
        public int compareTo(final QueueElem o) {

            // Compare explicitly instead of subtracting: a subtraction may overflow (for example a negative distance
            // compared with Integer.MAX_VALUE) and would then report the wrong order.
            if (this.distance < o.distance) {
                return -1;
            }
            return (this.distance == o.distance) ? 0 : 1;
        }

    }

    /**
     * Adds the given file input split to the set of file input splits to be consumed.
     * <p>
     * Note that the split is only added to the master set. Instance specific split lists which have already been
     * created by an earlier call to {@link #getNextInputSplit(Instance)} are not updated, so the respective instances
     * will not be offered this split.
     * 
     * @param fileInputSplit
     *        the file input split to be added
     */
    synchronized void addSplit(final FileInputSplit fileInputSplit) {

        this.masterSet.add(fileInputSplit);
    }

    /**
     * Returns the next file input split to be consumed by the given instance. The returned input split is selected in a
     * way that the distance between the split's storage location and the requesting {@link org.apache.flink.runtime.instance.Instance} is as
     * short as possible.
     * 
     * @param instance
     *        the instance requesting the next file input split
     * @return the next input split to be consumed by the given instance or <code>null</code> if all input splits have
     *         already been consumed.
     */
    synchronized FileInputSplit getNextInputSplit(final Instance instance) {

        final Queue<QueueElem> instanceSplitList = getInstanceSplitList(instance);

        while (true) {

            final QueueElem candidate = instanceSplitList.poll();
            if (candidate == null) {
                // The instance specific list is exhausted
                return null;
            }

            // The instance specific list may contain splits which have been handed out to other instances in the
            // meantime. Only a successful removal from the master set proves that the split is still unconsumed.
            if (this.masterSet.remove(candidate.getInputSplit())) {
                if (LOG.isInfoEnabled()) {
                    if (candidate.distance == 0) {
                        LOG.info(instance + " receives local file input split");
                    } else {
                        LOG.info(instance + " receives remote file input split (distance " + candidate.distance
                                + ")");
                    }
                }
                return candidate.getInputSplit();
            }

            // Nothing is left to be consumed, so there is no point in polling the remaining stale candidates
            if (this.masterSet.isEmpty()) {
                return null;
            }
        }
    }

    /**
     * Returns a list of file input splits specifically ordered for the given {@link org.apache.flink.runtime.instance.Instance}. When the list is
     * initially created, it contains all the unconsumed file input splits at that point in time, ascendingly ordered by
     * the minimum distance between the input splits' storage locations and the given {@link org.apache.flink.runtime.instance.Instance}.
     * On subsequent calls for the same instance the cached list is returned as is.
     * 
     * @param instance
     *        the instance for which the file input split list has been computed
     * @return the list of file input splits ordered specifically for the given instance
     */
    private Queue<QueueElem> getInstanceSplitList(final Instance instance) {

        Queue<QueueElem> instanceSplitList = this.instanceMap.get(instance);
        if (instanceSplitList == null) {

            // Create and populate instance specific split list
            instanceSplitList = new PriorityQueue<QueueElem>();
            for (final FileInputSplit split : this.masterSet) {
                instanceSplitList.add(new QueueElem(split, computeMinDistance(instance, split.getHostNames())));
            }

            this.instanceMap.put(instance, instanceSplitList);
        }

        return instanceSplitList;
    }

    /**
     * Computes the minimum distance between the given instance and the given host names.
     * 
     * @param instance
     *        the instance the distance is computed for
     * @param hostNames
     *        the names of the hosts storing an input split, possibly <code>null</code>
     * @return the smallest distance reported by the instance for any of the given host names or
     *         {@link Integer#MAX_VALUE} if the host names are <code>null</code> or empty
     */
    private static int computeMinDistance(final Instance instance, final String[] hostNames) {

        int minDistance = Integer.MAX_VALUE;
        if (hostNames == null) {
            // No storage location is known, so the split is treated as maximally distant
            return minDistance;
        }

        for (final String hostName : hostNames) {
            final int distance = instance.getDistance(hostName);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Distance between " + instance + " and " + hostName + " is " + distance);
            }
            if (distance < minDistance) {
                minDistance = distance;
            }
        }

        return minDistance;
    }
}