org.apache.hadoop.mapred.lib.CombineFileInputFormat.java Source code

Introduction

Here is the source code for org.apache.hadoop.mapred.lib.CombineFileInputFormat.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred.lib;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.LinkedList;
import java.util.HashSet;
import java.util.List;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.net.NetworkTopology;

import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RecordReader;
import org.mortbay.util.ajax.JSON;

/**
 * An abstract {@link org.apache.hadoop.mapred.InputFormat} that returns {@link CombineFileSplit}s
 * from the {@link org.apache.hadoop.mapred.InputFormat#getSplits(JobConf, int)} method.
 * Splits are constructed from the files under the input paths.
 * A split cannot have files from different pools.
 * Each split returned may contain blocks from different files.
 * If a maxSplitSize is specified, then blocks on the same node are
 * combined to form a single split. Blocks that are left over are
 * then combined with other blocks in the same rack.
 * If maxSplitSize is not specified, then blocks from the same rack
 * are combined in a single split; no attempt is made to create
 * node-local splits.
 * If the maxSplitSize is equal to the block size, then this class
 * is similar to the default splitting behaviour in Hadoop: each
 * block is a locally processed split.
 * Subclasses implement {@link org.apache.hadoop.mapred.InputFormat#getRecordReader(InputSplit, JobConf, Reporter)}
 * to construct <code>RecordReader</code>'s for <code>CombineFileSplit</code>'s.
 * @see CombineFileSplit
 */
public abstract class CombineFileInputFormat<K, V> extends FileInputFormat<K, V> {

    // ability to limit the size of a single split
    private long maxSplitSize = 0;
    private long minSplitSizeNode = 0;
    private long minSplitSizeRack = 0;
    private long maxNumBlocksPerSplit = 0;

    // A list of input path filter pools. A split cannot have blocks from files
    // across multiple pools.
    private ArrayList<MultiPathFilter> pools = new ArrayList<MultiPathFilter>();

    // mapping from a rack name to the set of Nodes in the rack
    private HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();

    // Whether to pass only the path component of the URI to the pool filters
    private boolean poolFilterPathOnly = true;

    // Special log for json metrics (all split stats sent here for easier
    // parsing)
    private static final Log JSON_METRICS_LOG = LogFactory.getLog("JsonMetrics");

    // Map of the stat type to actual stats
    private final EnumMap<SplitType, SplitTypeStats> splitTypeStatsMap = new EnumMap<SplitType, SplitTypeStats>(
            SplitType.class);

    // Split statistics types
    private enum SplitType {
        SINGLE_BLOCK_LOCAL, NODE_LOCAL, NODE_LOCAL_LEFTOVER, RACK_LOCAL, RACK_LOCAL_LEFTOVER, OVERFLOW, OVERFLOW_LEFTOVER, ALL
    }

    /** Are the split type stats valid? */
    private boolean isSplitTypeStatsValid = true;

    /**
     * Get whether the type stats are valid.  Used for testing.
     *
     * @return true if the type stats are valid, false otherwise
     */
    public boolean isTypeStatsValid() {
        return isSplitTypeStatsValid;
    }

    /**
     * Stats associated with a split type
     */
    private class SplitTypeStats {
        private int totalSplitCount = 0;
        private long totalSize = 0;
        private long totalBlockCount = 0;
        private long totalHostCount = 0;

        /**
         * Add a split for this type
         * @param splitSize Size of the split
         * @param hostCount Hosts listed for this split
         * @param blockCount Blocks in this split
         */
        public void addSplit(long splitSize, long hostCount, long blockCount) {
            ++totalSplitCount;
            totalSize += splitSize;
            totalBlockCount += blockCount;
            totalHostCount += hostCount;
        }

        public int getTotalSplitCount() {
            return totalSplitCount;
        }

        public long getTotalSize() {
            return totalSize;
        }

        public long getTotalHostCount() {
            return totalHostCount;
        }

        public long getTotalBlockCount() {
            return totalBlockCount;
        }
    }

    /**
     * Add stats for a split type (e.g. node-local splits,
     * rack-local splits) and keep a total count.
     * @param splitType Type of split being recorded
     * @param splitSize Size of the split
     * @param hostCount Hosts listed for this split
     * @param blockCount Blocks in this split
     */
    private void addStatsForSplitType(SplitType splitType, long splitSize, long hostCount, long blockCount) {
        SplitTypeStats splitTypeStats = splitTypeStatsMap.get(splitType);
        if (splitTypeStats == null) {
            splitTypeStats = new SplitTypeStats();
            splitTypeStatsMap.put(splitType, splitTypeStats);
        }
        splitTypeStats.addSplit(splitSize, hostCount, blockCount);

        // Add all splits to the ALL split type
        if (splitType != SplitType.ALL) {
            addStatsForSplitType(SplitType.ALL, splitSize, hostCount, blockCount);
        }
    }

    /**
     * Get stats for every split type as a JSON string
     * @return JSON string of all split type stats
     */
    private String getStatsString() {
        SplitTypeStats allTypeStats = splitTypeStatsMap.get(SplitType.ALL);
        Map<String, Map<String, Number>> statsMapMap = new HashMap<String, Map<String, Number>>();
        for (Map.Entry<SplitType, SplitTypeStats> entry : splitTypeStatsMap.entrySet()) {
            Map<String, Number> statsMap = new HashMap<String, Number>();
            statsMapMap.put(entry.getKey().toString(), statsMap);

            float percentTotalSplitCount = (100f * entry.getValue().getTotalSplitCount())
                    / allTypeStats.getTotalSplitCount();
            float percentTotalSize = (100f * entry.getValue().getTotalSize()) / allTypeStats.getTotalSize();
            float percentTotalBlockCount = (100f * entry.getValue().getTotalBlockCount())
                    / allTypeStats.getTotalBlockCount();
            float averageSizePerSplit = ((float) entry.getValue().getTotalSize())
                    / entry.getValue().getTotalSplitCount();
            float averageHostCountPerSplit = ((float) entry.getValue().getTotalHostCount())
                    / entry.getValue().getTotalSplitCount();
            float averageBlockCountPerSplit = ((float) entry.getValue().getTotalBlockCount())
                    / entry.getValue().getTotalSplitCount();
            statsMap.put("totalSplitCount", entry.getValue().getTotalSplitCount());
            statsMap.put("percentTotalSplitCount", percentTotalSplitCount);
            statsMap.put("totalSize", entry.getValue().getTotalSize());
            statsMap.put("percentTotalSize", percentTotalSize);
            statsMap.put("averageSizePerSplit", averageSizePerSplit);
            statsMap.put("totalHostCount", entry.getValue().getTotalHostCount());
            statsMap.put("averageHostCountPerSplit", averageHostCountPerSplit);
            statsMap.put("totalBlockCount", entry.getValue().getTotalBlockCount());
            statsMap.put("percentTotalBlockCount", percentTotalBlockCount);
            statsMap.put("averageBlockCountPerSplit", averageBlockCountPerSplit);
        }
        return JSON.toString(statsMapMap);
    }

    /**
     * Specify the maximum size (in bytes) of each split. Each split is
     * approximately equal to the specified size.
     */
    protected void setMaxSplitSize(long maxSplitSize) {
        this.maxSplitSize = maxSplitSize;
    }

    /**
     * Specify the maximum number of blocks in each split.
     */
    protected void setMaxNumBlocksPerSplit(long maxNumBlocksPerSplit) {
        this.maxNumBlocksPerSplit = maxNumBlocksPerSplit;
    }

    /**
     * Specify the minimum size (in bytes) of each split per node.
     * This applies to data that is left over after combining data on a single
     * node into splits that are of maximum size specified by maxSplitSize.
     * This leftover data will be combined into its own split if its size
     * exceeds minSplitSizeNode.
     */
    protected void setMinSplitSizeNode(long minSplitSizeNode) {
        this.minSplitSizeNode = minSplitSizeNode;
    }

    /**
     * Specify the minimum size (in bytes) of each split per rack.
     * This applies to data that is left over after combining data on a single
     * rack into splits that are of maximum size specified by maxSplitSize.
     * This leftover data will be combined into its own split if its size
     * exceeds minSplitSizeRack.
     */
    protected void setMinSplitSizeRack(long minSplitSizeRack) {
        this.minSplitSizeRack = minSplitSizeRack;
    }

    /**
     * Create a new pool and add the filters to it.
     * A split cannot have files from different pools.
     */
    protected void createPool(JobConf conf, List<PathFilter> filters) {
        pools.add(new MultiPathFilter(filters));
    }

    /**
     * Create a new pool and add the filters to it.
     * A pathname can satisfy any one of the specified filters.
     * A split cannot have files from different pools.
     */
    protected void createPool(JobConf conf, PathFilter... filters) {
        MultiPathFilter multi = new MultiPathFilter();
        for (PathFilter f : filters) {
            multi.add(f);
        }
        pools.add(multi);
    }

    private CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(new JobConf());

    @Override
    protected boolean isSplitable(FileSystem ignored, Path file) {
        return compressionCodecs.getCodec(file) == null;
    }

    /**
     * default constructor
     */
    public CombineFileInputFormat() {
        // Pre-populate the ALL stats entry, in case there are no splits
        splitTypeStatsMap.put(SplitType.ALL, new SplitTypeStats());
    }

    /**
     *
     * @param pathOnly If true, pass only the path component of input paths (i.e.
     * strip out the scheme and authority) to the pool filters
     */
    protected void setPoolFilterPathOnly(boolean pathOnly) {
        poolFilterPathOnly = pathOnly;
    }

    protected boolean getPoolFilterPathOnly() {
        return poolFilterPathOnly;
    }

    @Override
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        long minSizeNode = 0;
        long minSizeRack = 0;
        long maxSize = 0;
        long maxNumBlocks = 0;

        // the values specified by the setXxxSplitSize() methods take precedence
        // over the values that might have been specified in the config
        if (minSplitSizeNode != 0) {
            minSizeNode = minSplitSizeNode;
        } else {
            minSizeNode = job.getLong("mapred.min.split.size.per.node", 0);
        }
        if (maxNumBlocksPerSplit != 0) {
            maxNumBlocks = maxNumBlocksPerSplit;
        } else {
            maxNumBlocks = job.getLong("mapred.max.num.blocks.per.split", 0);
        }
        if (minSplitSizeRack != 0) {
            minSizeRack = minSplitSizeRack;
        } else {
            minSizeRack = job.getLong("mapred.min.split.size.per.rack", 0);
        }
        if (maxSplitSize != 0) {
            maxSize = maxSplitSize;
        } else {
            maxSize = job.getLong("mapred.max.split.size", 0);
        }
        if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
            throw new IOException("Minimum split size pernode " + minSizeNode
                    + " cannot be larger than maximum split size " + maxSize);
        }
        if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
            throw new IOException("Minimum split size per rack " + minSizeRack
                    + " cannot be larger than maximum split size " + maxSize);
        }
        if (minSizeRack != 0 && minSizeNode > minSizeRack) {
            throw new IOException("Minimum split size per rack " + minSizeRack
                    + " cannot be smaller than minimum split size per node " + minSizeNode);
        }

        // all the files in input set
        LocatedFileStatus[] stats = listLocatedStatus(job);
        long totalLen = 0;
        for (LocatedFileStatus stat : stats) {
            totalLen += stat.getLen();
        }
        List<CombineFileSplit> splits = new ArrayList<CombineFileSplit>();
        if (stats.length == 0) {
            return splits.toArray(new CombineFileSplit[splits.size()]);
        }

        // Put them into a list for easier removal during iteration
        Collection<LocatedFileStatus> newstats = new LinkedList<LocatedFileStatus>();
        Collections.addAll(newstats, stats);
        stats = null;

        // In one single iteration, process all the paths in a single pool.
        // Processing one pool at a time ensures that a split contains paths
        // from a single pool only.
        for (MultiPathFilter onepool : pools) {
            ArrayList<LocatedFileStatus> myStats = new ArrayList<LocatedFileStatus>();

            // pick one input path. If it matches all the filters in a pool,
            // add it to the output set
            for (Iterator<LocatedFileStatus> iter = newstats.iterator(); iter.hasNext();) {
                LocatedFileStatus stat = iter.next();
                if (onepool.accept(stat.getPath(), poolFilterPathOnly)) {
                    myStats.add(stat); // add it to my output set
                    iter.remove();
                }
            }
            // create splits for all files in this pool.
            getMoreSplits(job, myStats, maxSize, minSizeNode, minSizeRack, maxNumBlocks, splits);
        }

        // create splits for all files that are not in any pool.
        getMoreSplits(job, newstats, maxSize, minSizeNode, minSizeRack, maxNumBlocks, splits);

        // free up rackToNodes map
        rackToNodes.clear();
        verifySplits(job, totalLen, splits);

        // Print the stats of the splits to the special json metrics log for easier
        // parsing.  Also, clean up the stats after each getSplits() call since
        // others may call it multiple times (e.g. CombineHiveInputFormat)
        JSON_METRICS_LOG.info(getStatsString());
        splitTypeStatsMap.clear();

        return splits.toArray(new CombineFileSplit[splits.size()]);
    }

    private void verifySplits(JobConf conf, long totalLen, List<CombineFileSplit> splits) throws IOException {
        if (!conf.getBoolean("mapred.fileinputformat.verifysplits", true)) {
            return;
        }
        long totalSplitLen = 0;
        for (CombineFileSplit split : splits) {
            totalSplitLen += split.getLength();
        }

        if (totalLen != totalSplitLen) {
            throw new IOException(
                    "Total length expected is " + totalLen + ", but total split length is " + totalSplitLen);
        }

        if (splitTypeStatsMap.get(SplitType.ALL).getTotalSize() != totalSplitLen) {
            LOG.error("Total length expected is " + totalLen + ", but total split length according to stats is "
                    + splitTypeStatsMap.get(SplitType.ALL).getTotalSize() + ", previous isSplitTypeStatsValid = "
                    + isSplitTypeStatsValid);
            isSplitTypeStatsValid = false;
        }
    }

    /**
     * Comparator to be used with sortBlocksBySize to sort from largest to
     * smallest.
     */
    private class OneBlockInfoSizeComparator implements Comparator<OneBlockInfo> {
        @Override
        public int compare(OneBlockInfo left, OneBlockInfo right) {
            // Compare explicitly instead of casting the difference to int,
            // which could overflow for very large length differences.
            return (right.length < left.length) ? -1 : ((right.length == left.length) ? 0 : 1);
        }
    }

    /**
     * Sort the blocks on each node by size, largest to smallest
     *
     * @param nodeToBlocks Map of nodes to all blocks on that node
     */
    private void sortBlocksBySize(Map<String, List<OneBlockInfo>> nodeToBlocks) {
        OneBlockInfoSizeComparator comparator = new OneBlockInfoSizeComparator();
        for (Entry<String, List<OneBlockInfo>> entry : nodeToBlocks.entrySet()) {
            Collections.sort(entry.getValue(), comparator);
        }
    }

    /**
     * Return all the splits in the specified set of paths
     */
    private void getMoreSplits(JobConf job, Collection<LocatedFileStatus> stats, long maxSize, long minSizeNode,
            long minSizeRack, long maxNumBlocksPerSplit, List<CombineFileSplit> splits) throws IOException {

        // all blocks for all the files in input set
        OneFileInfo[] files;

        // mapping from a rack name to the list of blocks it has
        HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();

        // mapping from a block to the nodes on which it has replicas
        HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();

        // mapping from a node to the list of blocks that it contains
        HashMap<String, List<OneBlockInfo>> nodeToBlocks = new HashMap<String, List<OneBlockInfo>>();

        if (stats.isEmpty()) {
            return;
        }
        files = new OneFileInfo[stats.size()];

        // populate all the blocks for all files
        long totLength = 0;
        int fileIndex = 0;
        for (LocatedFileStatus oneStatus : stats) {
            files[fileIndex] = new OneFileInfo(oneStatus, job,
                    isSplitable(FileSystem.get(job), oneStatus.getPath()), rackToBlocks, blockToNodes, nodeToBlocks,
                    rackToNodes, maxSize);
            totLength += files[fileIndex].getLength();
            fileIndex++;
        }

        // Sort the blocks on each node from biggest to smallest by size to
        // encourage more node-local single block splits
        sortBlocksBySize(nodeToBlocks);

        ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
        Set<String> nodes = new HashSet<String>();
        long curSplitSize = 0;

        // process all nodes and create splits that are local
        // to a node.
        for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator(); iter
                .hasNext();) {

            Map.Entry<String, List<OneBlockInfo>> one = iter.next();
            nodes.add(one.getKey());
            List<OneBlockInfo> blocksInNode = one.getValue();

            // for each block, copy it into validBlocks. Delete it from
            // blockToNodes so that the same block does not appear in
            // two different splits.
            for (OneBlockInfo oneblock : blocksInNode) {
                if (blockToNodes.containsKey(oneblock)) {
                    validBlocks.add(oneblock);
                    blockToNodes.remove(oneblock);
                    curSplitSize += oneblock.length;

                    // if the accumulated split size exceeds the maximum, then
                    // create this split.
                    if ((maxSize != 0 && curSplitSize >= maxSize)
                            || (maxNumBlocksPerSplit > 0 && validBlocks.size() >= maxNumBlocksPerSplit)) {
                        // create an input split and add it to the splits array
                        // if only one block, add all the node replicas
                        if (validBlocks.size() == 1) {
                            Set<String> blockLocalNodes = new HashSet<String>(
                                    Arrays.asList(validBlocks.get(0).hosts));
                            addCreatedSplit(job, splits, blockLocalNodes, validBlocks);
                            addStatsForSplitType(SplitType.SINGLE_BLOCK_LOCAL, curSplitSize, blockLocalNodes.size(),
                                    validBlocks.size());
                        } else {
                            addCreatedSplit(job, splits, nodes, validBlocks);
                            addStatsForSplitType(SplitType.NODE_LOCAL, curSplitSize, nodes.size(),
                                    validBlocks.size());
                        }
                        curSplitSize = 0;
                        validBlocks.clear();
                    }
                }
            }
            // if there were any blocks left over and their combined size is
            // larger than minSizeNode, then combine them into one split.
            // Otherwise add them back to the unprocessed pool. It is likely
            // that they will be combined with other blocks from the same rack later on.
            if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
                // create an input split and add it to the splits array
                addCreatedSplit(job, splits, nodes, validBlocks);
                addStatsForSplitType(SplitType.NODE_LOCAL_LEFTOVER, curSplitSize, nodes.size(), validBlocks.size());
            } else {
                for (OneBlockInfo oneblock : validBlocks) {
                    blockToNodes.put(oneblock, oneblock.hosts);
                }
            }
            validBlocks.clear();
            nodes.clear();
            curSplitSize = 0;
        }

        // if blocks in a rack are below the specified minimum size, then keep them
        // in 'overflow'. After the processing of all racks is complete, these overflow
        // blocks will be combined into splits.
        ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
        Set<String> racks = new HashSet<String>();

        // Process all racks over and over again until there is no more work to do.
        boolean noRacksMadeSplit = false;
        while (blockToNodes.size() > 0) {

            // Create one split for this rack before moving on to the next rack.
            // Come back to this rack after creating a single split for each of the
            // remaining racks.
            // Process one rack location at a time; combine all possible blocks that
            // reside on this rack into one split (constrained by the minimum and
            // maximum split sizes).

            // Iterate over all racks.  Add to the overflow blocks only if at least
            // one pass over all the racks was completed without adding any splits
            long splitsAddedOnAllRacks = 0;
            for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = rackToBlocks.entrySet().iterator(); iter
                    .hasNext();) {

                Map.Entry<String, List<OneBlockInfo>> one = iter.next();
                racks.add(one.getKey());
                List<OneBlockInfo> blocks = one.getValue();

                // for each block, copy it into validBlocks. Delete it from
                // blockToNodes so that the same block does not appear in
                // two different splits.
                boolean createdSplit = false;
                for (OneBlockInfo oneblock : blocks) {
                    if (blockToNodes.containsKey(oneblock)) {
                        validBlocks.add(oneblock);
                        blockToNodes.remove(oneblock);
                        curSplitSize += oneblock.length;

                        // if the accumulated split size exceeds the maximum, then
                        // create this split.
                        if ((maxSize != 0 && curSplitSize >= maxSize)
                                || (maxNumBlocksPerSplit > 0 && validBlocks.size() >= maxNumBlocksPerSplit)) {
                            // create an input split and add it to the splits array
                            addCreatedSplit(job, splits, getHosts(racks), validBlocks);
                            addStatsForSplitType(SplitType.RACK_LOCAL, curSplitSize, getHosts(racks).size(),
                                    validBlocks.size());
                            createdSplit = true;
                            ++splitsAddedOnAllRacks;
                            break;
                        }
                    }
                }

                // if we created a split, then just go to the next rack
                if (createdSplit) {
                    curSplitSize = 0;
                    validBlocks.clear();
                    racks.clear();
                    continue;
                }

                if (!validBlocks.isEmpty()) {
                    if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
                        // if a minimum size is specified and met, then create a single split;
                        // otherwise, store these blocks in the overflow data structure
                        addCreatedSplit(job, splits, getHosts(racks), validBlocks);
                        addStatsForSplitType(SplitType.RACK_LOCAL_LEFTOVER, curSplitSize, getHosts(racks).size(),
                                validBlocks.size());
                        ++splitsAddedOnAllRacks;
                    } else if (!noRacksMadeSplit) {
                        // Add the blocks back if a pass over all racks found at least one
                        // split, or if this is the first pass
                        for (OneBlockInfo oneblock : validBlocks) {
                            blockToNodes.put(oneblock, oneblock.hosts);
                        }
                    } else {
                        // There were a few blocks in this rack that remained to be processed.
                        // Keep them in 'overflow' block list. These will be combined later.
                        overflowBlocks.addAll(validBlocks);
                    }
                }
                curSplitSize = 0;
                validBlocks.clear();
                racks.clear();
            }

            if (splitsAddedOnAllRacks == 0) {
                noRacksMadeSplit = true;
            }
        }

        assert blockToNodes.isEmpty();
        assert curSplitSize == 0;
        assert validBlocks.isEmpty();
        assert racks.isEmpty();

        // Process all overflow blocks
        for (OneBlockInfo oneblock : overflowBlocks) {
            validBlocks.add(oneblock);
            curSplitSize += oneblock.length;

            // This might cause an existing rack location to be re-added,
            // but it should be OK because racks is a Set.
            for (int i = 0; i < oneblock.racks.length; i++) {
                racks.add(oneblock.racks[i]);
            }

            // if the accumulated split size exceeds the maximum, then
            // create this split.
            if ((maxSize != 0 && curSplitSize >= maxSize)
                    || (maxNumBlocksPerSplit > 0 && validBlocks.size() >= maxNumBlocksPerSplit)) {
                // create an input split and add it to the splits array
                addCreatedSplit(job, splits, getHosts(racks), validBlocks);
                addStatsForSplitType(SplitType.OVERFLOW, curSplitSize, getHosts(racks).size(), validBlocks.size());
                curSplitSize = 0;
                validBlocks.clear();
                racks.clear();
            }
        }

        // Process any remaining blocks.
        if (!validBlocks.isEmpty()) {
            addCreatedSplit(job, splits, getHosts(racks), validBlocks);
            addStatsForSplitType(SplitType.OVERFLOW_LEFTOVER, curSplitSize, getHosts(racks).size(),
                    validBlocks.size());
        }
    }

    /**
     * Create a single split from the list of blocks specified in validBlocks.
     * Add this new split to splitList.
     */
    private void addCreatedSplit(JobConf job, List<CombineFileSplit> splitList, Collection<String> locations,
            ArrayList<OneBlockInfo> validBlocks) {
        // create an input split
        Path[] fl = new Path[validBlocks.size()];
        long[] offset = new long[validBlocks.size()];
        long[] length = new long[validBlocks.size()];
        for (int i = 0; i < validBlocks.size(); i++) {
            fl[i] = validBlocks.get(i).onepath;
            offset[i] = validBlocks.get(i).offset;
            length[i] = validBlocks.get(i).length;
        }

        // add this split to the list that is returned
        CombineFileSplit thissplit = new CombineFileSplit(job, fl, offset, length,
                locations.toArray(new String[locations.size()]));
        splitList.add(thissplit);
    }

    /**
     * Not implemented here; subclasses must implement this method to construct
     * a {@link RecordReader} for a {@link CombineFileSplit}.
     */
    public abstract RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException;

    /**
     * information about one file from the File System
     */
    private static class OneFileInfo {
        private long fileSize; // size of the file
        private OneBlockInfo[] blocks; // all blocks in this file

        OneFileInfo(LocatedFileStatus stat, JobConf job, boolean isSplitable,
                HashMap<String, List<OneBlockInfo>> rackToBlocks, HashMap<OneBlockInfo, String[]> blockToNodes,
                HashMap<String, List<OneBlockInfo>> nodeToBlocks, HashMap<String, Set<String>> rackToNodes,
                long maxSize) throws IOException {
            this.fileSize = 0;

            // get block locations from file system
            BlockLocation[] locations = stat.getBlockLocations();

            // create a list of all blocks and their locations
            if (locations == null || locations.length == 0) {
                blocks = new OneBlockInfo[0];
            } else {
                if (!isSplitable) {
                    // if the file is not splitable, just create the one block with
                    // full file length
                    blocks = new OneBlockInfo[1];
                    fileSize = stat.getLen();
                    blocks[0] = new OneBlockInfo(stat.getPath(), 0, fileSize, locations[0].getHosts(),
                            locations[0].getTopologyPaths());
                } else {
                    ArrayList<OneBlockInfo> blocksList = new ArrayList<OneBlockInfo>(locations.length);
                    for (int i = 0; i < locations.length; i++) {

                        fileSize += locations[i].getLength();

                        // each split can be a maximum of maxSize
                        long left = locations[i].getLength();
                        long myOffset = locations[i].getOffset();
                        long myLength = 0;
                        while (left > 0) {
                            if (maxSize == 0) {
                                myLength = left;
                            } else {
                                if (left > maxSize && left < 2 * maxSize) {
                                    // if the remainder is between maxSize and 2*maxSize, then
                                    // create two splits of size left/2 each, instead of splits
                                    // of size maxSize and left-maxSize.
                                    myLength = left / 2;
                                } else {
                                    myLength = Math.min(maxSize, left);
                                }
                            }
                            OneBlockInfo oneblock = new OneBlockInfo(stat.getPath(), myOffset, myLength,
                                    locations[i].getHosts(), locations[i].getTopologyPaths());
                            left -= myLength;
                            myOffset += myLength;

                            blocksList.add(oneblock);
                        }
                    }
                    blocks = blocksList.toArray(new OneBlockInfo[blocksList.size()]);
                }

                for (OneBlockInfo oneblock : blocks) {
                    // add this block to the block --> node locations map
                    blockToNodes.put(oneblock, oneblock.hosts);

                    // For blocks that do not have host/rack information,
                    // assign them to the default rack.
                    String[] racks = null;
                    if (oneblock.hosts.length == 0) {
                        racks = new String[] { NetworkTopology.DEFAULT_RACK };
                    } else {
                        racks = oneblock.racks;
                    }

                    // add this block to the rack --> block map
                    for (int j = 0; j < racks.length; j++) {
                        String rack = racks[j];
                        List<OneBlockInfo> blklist = rackToBlocks.get(rack);
                        if (blklist == null) {
                            blklist = new ArrayList<OneBlockInfo>();
                            rackToBlocks.put(rack, blklist);
                        }
                        blklist.add(oneblock);
                        if (!racks[j].equals(NetworkTopology.DEFAULT_RACK)) {
                            // Add this host to rackToNodes map
                            addHostToRack(rackToNodes, racks[j], oneblock.hosts[j]);
                        }
                    }

                    // add this block to the node --> block map
                    for (int j = 0; j < oneblock.hosts.length; j++) {
                        String node = oneblock.hosts[j];
                        List<OneBlockInfo> blklist = nodeToBlocks.get(node);
                        if (blklist == null) {
                            blklist = new ArrayList<OneBlockInfo>();
                            nodeToBlocks.put(node, blklist);
                        }
                        blklist.add(oneblock);
                    }
                }
            }
        }

        long getLength() {
            return fileSize;
        }

        OneBlockInfo[] getBlocks() {
            return blocks;
        }
    }

    /**
     * information about one block from the File System
     */
    private static class OneBlockInfo {
        Path onepath; // name of this file
        long offset; // offset in file
        long length; // length of this block
        String[] hosts; // nodes on which this block resides
        String[] racks; // network topology of hosts

        OneBlockInfo(Path path, long offset, long len, String[] hosts, String[] topologyPaths) {
            this.onepath = path;
            this.offset = offset;
            this.hosts = hosts;
            this.length = len;
            assert (hosts.length == topologyPaths.length || topologyPaths.length == 0);

            // if the file system does not have any rack information, then
            // use a dummy rack location.
            if (topologyPaths.length == 0) {
                topologyPaths = new String[hosts.length];
                for (int i = 0; i < topologyPaths.length; i++) {
                    topologyPaths[i] = (new NodeBase(hosts[i], NetworkTopology.DEFAULT_RACK)).toString();
                }
            }

            // The topology paths have the host name included as the last
            // component. Strip it.
            this.racks = new String[topologyPaths.length];
            for (int i = 0; i < topologyPaths.length; i++) {
                this.racks[i] = (new NodeBase(topologyPaths[i])).getNetworkLocation();
            }
        }
    }

    private static void addHostToRack(HashMap<String, Set<String>> rackToNodes, String rack, String host) {
        Set<String> hosts = rackToNodes.get(rack);
        if (hosts == null) {
            hosts = new HashSet<String>();
            rackToNodes.put(rack, hosts);
        }
        hosts.add(host);
    }

    private Set<String> getHosts(Set<String> racks) {
        Set<String> hosts = new HashSet<String>();
        for (String rack : racks) {
            if (rackToNodes.containsKey(rack)) {
                hosts.addAll(rackToNodes.get(rack));
            }
        }
        return hosts;
    }

    /**
     * Accept a path only if at least one of the filters given in the
     * constructor does.
     */
    private static class MultiPathFilter implements PathFilter {
        private List<PathFilter> filters;

        public MultiPathFilter() {
            this.filters = new ArrayList<PathFilter>();
        }

        public MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        public void add(PathFilter one) {
            filters.add(one);
        }

        public boolean accept(Path path) {
            for (PathFilter filter : filters) {
                if (filter.accept(path)) {
                    return true;
                }
            }
            return false;
        }

        /**
         * @param path the path to check
         * @param pathOnly whether to strip out the scheme/authority before passing
         * to the constituent filters
         * @return whether the path matches any of the filters
         */
        public boolean accept(Path path, boolean pathOnly) {
            Path pathToCheck = path;
            if (pathOnly) {
                pathToCheck = new Path(path.toUri().getPath());
            }
            return accept(pathToCheck);
        }

        public String toString() {
            StringBuffer buf = new StringBuffer();
            buf.append("[");
            for (PathFilter f : filters) {
                buf.append(f);
                buf.append(",");
            }
            buf.append("]");
            return buf.toString();
        }
    }
}
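
Example

CombineFileInputFormat is abstract; a job supplies a concrete subclass that implements getRecordReader() and, optionally, tunes the split-size limits through the protected setters. The sketch below is not part of the Hadoop sources: the class name MyCombineFormat and the per-chunk reader MyRecordReader are hypothetical, and the size limits are arbitrary values chosen for illustration.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.CombineFileInputFormat;
import org.apache.hadoop.mapred.lib.CombineFileSplit;

public class MyCombineFormat extends CombineFileInputFormat<LongWritable, Text> {

    public MyCombineFormat() {
        // Pack blocks into splits of roughly 256 MB; node- or rack-local
        // leftovers smaller than 64 MB are pushed up to the next level.
        setMaxSplitSize(256L * 1024 * 1024);
        setMinSplitSizeNode(64L * 1024 * 1024);
        setMinSplitSizeRack(64L * 1024 * 1024);
    }

    @Override
    public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException {
        // MyRecordReader is a hypothetical reader that walks the
        // (file, offset, length) chunks carried by the CombineFileSplit.
        return new MyRecordReader((CombineFileSplit) split, job, reporter);
    }
}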
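
Pools keep unrelated inputs from being combined into the same split. The following fragment is a hypothetical addition to the MyCombineFormat sketch above (it also assumes imports of org.apache.hadoop.fs.Path and org.apache.hadoop.fs.PathFilter): it groups .txt and .gz inputs into separate pools before delegating to the parent getSplits().

    @Override
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        // Each createPool() call registers a filter pool; blocks from files in
        // different pools are never combined into the same split. Files that
        // match no pool are handled together after all pools are processed.
        createPool(job, new PathFilter() {
            public boolean accept(Path path) {
                return path.getName().endsWith(".txt");
            }
        });
        createPool(job, new PathFilter() {
            public boolean accept(Path path) {
                return path.getName().endsWith(".gz");
            }
        });
        return super.getSplits(job, numSplits);
    }

Note that the pools list in CombineFileInputFormat is never cleared, so a real implementation would guard against registering the same filters again if getSplits() is invoked more than once.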
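
The same limits can also come from the job configuration: getSplits() above falls back to the properties mapred.max.split.size, mapred.min.split.size.per.node, mapred.min.split.size.per.rack and mapred.max.num.blocks.per.split whenever the corresponding setters were not called. A minimal driver-side sketch, with placeholder values and the hypothetical MyCombineFormat from above:

import org.apache.hadoop.mapred.JobConf;

public class MyDriver {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // These keys are only consulted when the corresponding setXxx()
        // methods were not called on the input format.
        conf.setLong("mapred.max.split.size", 256L * 1024 * 1024);
        conf.setLong("mapred.min.split.size.per.node", 64L * 1024 * 1024);
        conf.setLong("mapred.min.split.size.per.rack", 64L * 1024 * 1024);
        conf.setLong("mapred.max.num.blocks.per.split", 500);
        conf.setInputFormat(MyCombineFormat.class); // the sketch subclass above
        // ... set mapper, reducer, input/output paths, then submit the job
    }
}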