org.apache.hadoop.mapreduce.approx.lib.input.SampleTextInputFormat.java Source code

Introduction

Here is the source code for org.apache.hadoop.mapreduce.approx.lib.input.SampleTextInputFormat.java. It defines an abstract FileInputFormat that groups pre-sampled segments of the input files into locality-aware SampleFileSplit splits, combining blocks per node and per rack in the style of Hadoop's CombineFileInputFormat.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce.approx.lib.input;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.HashSet;
import java.util.List;
import java.util.HashMap;
import java.util.Set;
import java.util.Iterator;
import java.util.Map;
import java.util.Arrays;

import org.apache.commons.lang3.ArrayUtils;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.approx.SegmentsMap;
import org.apache.hadoop.mapreduce.approx.SegmentsMap.Segment;
import org.apache.log4j.Logger;

public abstract class SampleTextInputFormat<K, V> extends FileInputFormat<K, V> {
    private static final Logger LOG = Logger.getLogger("Subset.InputFormat");
    public static final String SPLIT_MINSIZE_PERNODE = "mapreduce.input.fileinputformat.split.minsize.per.node";
    public static final String SPLIT_MINSIZE_PERRACK = "mapreduce.input.fileinputformat.split.minsize.per.rack";
    // ability to limit the size of a single split
    private long maxSplitSize = 0;
    private long minSplitSizeNode = 0;
    private long minSplitSizeRack = 0;
    private boolean blockunit = false; // if "map.input.block.unit" is true, emit one split per sampled block
    // mapping from a rack name to the set of Nodes in the rack 
    private HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();

    public SampleTextInputFormat() {
    }

    protected void setMaxSplitSize(long maxSplitSize) {
        this.maxSplitSize = maxSplitSize;
    }

    /**
     * Specify the minimum size (in bytes) of each split per node.
     * This applies to data that is left over after combining data on a single
     * node into splits that are of maximum size specified by maxSplitSize.
     * This leftover data will be combined into its own split if its size
     * exceeds minSplitSizeNode.
     */
    protected void setMinSplitSizeNode(long minSplitSizeNode) {
        this.minSplitSizeNode = minSplitSizeNode;
    }

    /**
     * Specify the minimum size (in bytes) of each split per rack.
     * This applies to data that is left over after combining data on a single
     * rack into splits that are of maximum size specified by maxSplitSize.
     * This leftover data will be combined into its own split if its size
     * exceeds minSplitSizeRack.
     */
    protected void setMinSplitSizeRack(long minSplitSizeRack) {
        this.minSplitSizeRack = minSplitSizeRack;
    }
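
    /*
     * Worked example (illustrative numbers, not taken from this source): with
     * maxSplitSize = 64 MB and minSplitSizeNode = 16 MB, a node holding 70 MB of
     * sampled blocks yields one node-local split of roughly 64 MB; the remaining
     * 6 MB is below the per-node minimum, so those blocks are returned to the
     * pool and are later combined at rack level or placed in an overflow split.
     */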

    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {

        long minSizeNode = 0;
        long minSizeRack = 0;
        long maxSize = 0;
        Configuration conf = job.getConfiguration();
        blockunit = conf.getBoolean("map.input.block.unit", false);

        // the values specified by the setXxxSplitSize() methods take precedence over
        // the values that might have been specified in the config
        if (minSplitSizeNode != 0) {
            minSizeNode = minSplitSizeNode;
        } else {
            minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
        }
        if (minSplitSizeRack != 0) {
            minSizeRack = minSplitSizeRack;
        } else {
            minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
        }
        if (maxSplitSize != 0) {
            maxSize = maxSplitSize;
        } else {
            maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 67108864); // default: 64 MB
        }
        if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
            throw new IOException("Minimum split size pernode " + minSizeNode
                    + " cannot be larger than maximum split size " + maxSize);
        }
        if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
            throw new IOException("Minimum split size per rack" + minSizeRack
                    + " cannot be larger than maximum split size " + maxSize);
        }
        if (minSizeRack != 0 && minSizeNode > minSizeRack) {
            throw new IOException("Minimum split size per node" + minSizeNode
                    + " cannot be smaller than minimum split " + "size per rack " + minSizeRack);
        }

        // all the files in input set
        Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
        List<InputSplit> splits = new ArrayList<InputSplit>();
        if (paths.length == 0) {
            return splits;
        }

        // Qualify all paths against their file systems up front; doing this once
        // here avoids repeating the relatively costly lookup later on.
        List<Path> newpaths = new LinkedList<Path>();
        for (int i = 0; i < paths.length; i++) {
            FileSystem fs = paths[i].getFileSystem(conf);
            Path p = fs.makeQualified(paths[i]);
            newpaths.add(p);
        }
        paths = null;

        // create splits for all the input files
        getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

        // free up rackToNodes map
        rackToNodes.clear();
        return splits;
    }

    /**
     * Return all the splits in the specified set of paths
     */
    private void getMoreSplits(JobContext job, Path[] paths, long maxSize, long minSizeNode, long minSizeRack,
            List<InputSplit> splits) throws IOException {
        Configuration conf = job.getConfiguration();

        // all blocks for all the files in input set
        OneFileInfo[] files;

        // mapping from a rack name to the list of blocks it has
        HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();

        // mapping from a block to the nodes on which it has replicas
        HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();

        // mapping from a node to the list of blocks that it contains
        HashMap<String, List<OneBlockInfo>> nodeToBlocks = new HashMap<String, List<OneBlockInfo>>();

        files = new OneFileInfo[paths.length];
        if (paths.length == 0) {
            return;
        }

        // populate all the blocks for all files
        //***************************************sampling info*************************************
        //long totLength = 0;
        for (int i = 0; i < paths.length; i++) {
            files[i] = new OneFileInfo(paths[i], conf, isSplitable(job, paths[i]), rackToBlocks, blockToNodes,
                    nodeToBlocks, rackToNodes, maxSize);
            //totLength += files[i].getLength();
        }

        ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
        Set<String> nodes = new HashSet<String>();
        long curSplitSize = 0;

        // process all nodes and create splits that are local
        // to a node. 
        for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator(); iter
                .hasNext();) {

            Map.Entry<String, List<OneBlockInfo>> one = iter.next();
            nodes.add(one.getKey());
            List<OneBlockInfo> blocksInNode = one.getValue();

            // for each block, copy it into validBlocks. Delete it from 
            // blockToNodes so that the same block does not appear in 
            // two different splits.
            for (OneBlockInfo oneblock : blocksInNode) {
                if (blockToNodes.containsKey(oneblock)) {
                    validBlocks.add(oneblock);
                    blockToNodes.remove(oneblock);
                    //*******************************************segments compose splits****************
                    curSplitSize += oneblock.length;
                    if (blockunit) {
                        addCreatedSplit1(splits, validBlocks);
                        curSplitSize = 0;
                        validBlocks.clear();
                        continue;
                    }
                    // if the accumulated split size exceeds the maximum, then 
                    // create this split.
                    if (maxSize != 0 && curSplitSize >= maxSize) {
                        // create an input split and add it to the splits array
                        addCreatedSplit(splits, nodes, validBlocks);
                        curSplitSize = 0;
                        validBlocks.clear();
                    }
                }
            }
            // if there were any blocks left over and their combined size is
            // larger than minSplitNode, then combine them into one split.
            // Otherwise add them back to the unprocessed pool. It is likely 
            // that they will be combined with other blocks from the 
            // same rack later on.
            if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
                // create an input split and add it to the splits array
                addCreatedSplit(splits, nodes, validBlocks);
            } else {
                for (OneBlockInfo oneblock : validBlocks) {
                    blockToNodes.put(oneblock, oneblock.hosts);
                }
            }
            validBlocks.clear();
            nodes.clear();
            curSplitSize = 0;
        }

        // if blocks in a rack are below the specified minimum size, then keep them
        // in 'overflow'. After the processing of all racks is complete, these 
        // overflow blocks will be combined into splits.
        ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
        Set<String> racks = new HashSet<String>();

        // Process all racks over and over again until there is no more work to do.
        while (blockToNodes.size() > 0) {

            // Create one split for this rack before moving over to the next rack. 
            // Come back to this rack after creating a single split for each of the 
            // remaining racks.
            // Process one rack location at a time, Combine all possible blocks that
            // reside on this rack as one split. (constrained by minimum and maximum
            // split size).

            // iterate over all racks 
            for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = rackToBlocks.entrySet().iterator(); iter
                    .hasNext();) {

                Map.Entry<String, List<OneBlockInfo>> one = iter.next();
                racks.add(one.getKey());
                List<OneBlockInfo> blocks = one.getValue();

                // for each block, copy it into validBlocks. Delete it from 
                // blockToNodes so that the same block does not appear in 
                // two different splits.
                boolean createdSplit = false;
                for (OneBlockInfo oneblock : blocks) {
                    if (blockToNodes.containsKey(oneblock)) {
                        validBlocks.add(oneblock);
                        blockToNodes.remove(oneblock);
                        curSplitSize += oneblock.length;

                        // if the accumulated split size exceeds the maximum, then 
                        // create this split.
                        if (maxSize != 0 && curSplitSize >= maxSize) {
                            // create an input split and add it to the splits array
                            addCreatedSplit(splits, getHosts(racks), validBlocks);
                            createdSplit = true;
                            break;
                        }
                    }
                }

                // if we created a split, then just go to the next rack
                if (createdSplit) {
                    curSplitSize = 0;
                    validBlocks.clear();
                    racks.clear();
                    continue;
                }

                if (!validBlocks.isEmpty()) {
                    if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
                        // if there is a minimum size specified, then create a single split
                        // otherwise, store these blocks into overflow data structure
                        addCreatedSplit(splits, getHosts(racks), validBlocks);
                    } else {
                        // There were a few blocks in this rack that 
                        // remained to be processed. Keep them in 'overflow' block list. 
                        // These will be combined later.
                        overflowBlocks.addAll(validBlocks);
                    }
                }
                curSplitSize = 0;
                validBlocks.clear();
                racks.clear();
            }
        }

        assert blockToNodes.isEmpty();
        assert curSplitSize == 0;
        assert validBlocks.isEmpty();
        assert racks.isEmpty();

        // Process all overflow blocks
        for (OneBlockInfo oneblock : overflowBlocks) {
            validBlocks.add(oneblock);
            curSplitSize += oneblock.length;

            // This might cause an existing rack location to be re-added,
            // but it should be ok.
            for (int i = 0; i < oneblock.racks.length; i++) {
                racks.add(oneblock.racks[i]);
            }

            // if the accumulated split size exceeds the maximum, then 
            // create this split.
            if (maxSize != 0 && curSplitSize >= maxSize) {
                // create an input split and add it to the splits array
                addCreatedSplit(splits, getHosts(racks), validBlocks);
                curSplitSize = 0;
                validBlocks.clear();
                racks.clear();
            }
        }

        // Process any remaining blocks.
        if (!validBlocks.isEmpty()) {
            addCreatedSplit(splits, getHosts(racks), validBlocks);
        }
    }

    /**
     * Create a single split from the first block in validBlocks (used when
     * "map.input.block.unit" is set, so that each sampled block becomes its own
     * split) and add the new split to splitList.
     */
    private void addCreatedSplit1(List<InputSplit> splitList, ArrayList<OneBlockInfo> validBlocks) {
        OneBlockInfo oneblockinfo = validBlocks.get(0);
        SampleFileSplit thissplit = new SampleFileSplit(oneblockinfo.onepath, oneblockinfo.segOffset,
                oneblockinfo.segLength, oneblockinfo.segKeys, oneblockinfo.segWeights, oneblockinfo.hosts);
        splitList.add(thissplit);
        int index = splitList.size() - 1;
        for (int i = 0; i < oneblockinfo.segOffset.length; i++) {
            LOG.info("split:" + String.valueOf(index) + " segment:" + String.valueOf(i) + " offset:"
                    + String.valueOf(oneblockinfo.segOffset[i]) + " length:"
                    + String.valueOf(oneblockinfo.segLength[i]) + " key:" + oneblockinfo.segKeys[i]);
        }
    }

    /**
     * Create a single split from the list of blocks specified in validBlocks
     * and add the new split to splitList.
     */
    private void addCreatedSplit(List<InputSplit> splitList, Collection<String> locations,
            ArrayList<OneBlockInfo> validBlocks) {
        // create an input split
        Path fl = validBlocks.get(0).onepath;
        long[] offset = null;
        long[] length = null;
        String[] key = null;
        String[] weight = null;
        for (int i = 0; i < validBlocks.size(); i++) {
            //fl[i] = validBlocks.get(i).onepath; 
            offset = ArrayUtils.addAll(offset, validBlocks.get(i).segOffset);
            length = ArrayUtils.addAll(length, validBlocks.get(i).segLength);
            key = ArrayUtils.addAll(key, validBlocks.get(i).segKeys);
            weight = ArrayUtils.addAll(weight, validBlocks.get(i).segWeights);
        }

        // add this split to the list that is returned
        SampleFileSplit thissplit = new SampleFileSplit(fl, offset, length, key, weight,
                locations.toArray(new String[0]));
        splitList.add(thissplit);
        int index = splitList.size() - 1;
        for (int i = 0; i < offset.length; i++) {
            LOG.info("split:" + String.valueOf(index) + " segment:" + String.valueOf(i) + " offset:"
                    + String.valueOf(offset[i]) + " length:" + String.valueOf(length[i]) + " key:" + key[i]);
        }
    }

    /**
     * Subclasses must implement this to supply a {@link RecordReader} that
     * understands {@link SampleFileSplit}.
     */
    public abstract RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException;

    /**
     * information about one file from the File System
     */

    //sampling info add***********************************************************************
    private static class OneFileInfo {
        private long fileSize; // size of the file
        private OneBlockInfo[] blocks; // all blocks in this file

        OneFileInfo(Path path, Configuration conf, boolean isSplitable,
                HashMap<String, List<OneBlockInfo>> rackToBlocks, HashMap<OneBlockInfo, String[]> blockToNodes,
                HashMap<String, List<OneBlockInfo>> nodeToBlocks, HashMap<String, Set<String>> rackToNodes,
                long maxSize) throws IOException {
            this.fileSize = 0;

            // get block locations from file system
            FileSystem fs = path.getFileSystem(conf);
            FileStatus stat = fs.getFileStatus(path);
            BlockLocation[] locations = fs.getFileBlockLocations(stat, 0, stat.getLen());
            // get all sample segments

            SegmentsMap smap = new SegmentsMap(conf, path);
            Segment[] sampleSegList = smap.getSampleSegmentsList();
            LOG.info("sampled segments:" + String.valueOf(sampleSegList.length));
            // create a list of all blocks and their locations
            if (locations == null) {
                blocks = new OneBlockInfo[0];
            } else {
                if (!isSplitable) {
                    // Non-splitable files contribute no sampled blocks here; the original
                    // whole-file handling is left commented out. Use an empty array so the
                    // loop below does not dereference a null 'blocks' reference.
                    blocks = new OneBlockInfo[0];
                    //blocks = new OneBlockInfo[1];
                    //fileSize = stat.getLen();
                    //blocks[0] = new OneBlockInfo(path, 0, fileSize, locations[0]
                    //.getHosts(), locations[0].getTopologyPaths());
                } else {
                    ArrayList<OneBlockInfo> blocksList = new ArrayList<OneBlockInfo>(locations.length);
                    for (int i = 0, j = 0; i < locations.length; i++) {
                        fileSize += locations[i].getLength();
                        //**************************segments to block*************************************
                        // each split can be a maximum of maxSize
                        long blklength = locations[i].getLength();
                        long blkOffset = locations[i].getOffset();
                        long[] segmentOffsets = null;
                        long[] segmentLengths = null;
                        ArrayList<String> segmentKeys = new ArrayList<String>();
                        ArrayList<String> segmentWeights = new ArrayList<String>();
                        int k = j;
                        while (j < sampleSegList.length && sampleSegList[j].getOffset() >= blkOffset
                                && sampleSegList[j].getOffset() < blkOffset + blklength) {
                            segmentOffsets = ArrayUtils.addAll(segmentOffsets,
                                    new long[] { sampleSegList[j].getOffset() });
                            segmentLengths = ArrayUtils.addAll(segmentLengths,
                                    new long[] { sampleSegList[j].getLength() });
                            segmentKeys.add(sampleSegList[j].getKeys());
                            segmentWeights.add(sampleSegList[j].getWeights());
                            j++;
                        }
                        if (j == k) {
                            continue;
                        }
                        long[] myOffset = segmentOffsets;
                        long[] myLength = segmentLengths;
                        String[] mykey = segmentKeys.toArray(new String[segmentKeys.size()]);
                        String[] myweight = segmentWeights.toArray(new String[segmentWeights.size()]);

                        //******************************************add segment info*************************
                        OneBlockInfo oneblock = new OneBlockInfo(path, myOffset, myLength, mykey, myweight,
                                locations[i].getHosts(), locations[i].getTopologyPaths());
                        //left -= myLength;
                        //myOffset += myLength;

                        blocksList.add(oneblock);
                    }
                    blocks = blocksList.toArray(new OneBlockInfo[blocksList.size()]);
                }

                for (OneBlockInfo oneblock : blocks) {
                    // add this block to the block --> node locations map
                    blockToNodes.put(oneblock, oneblock.hosts);

                    // For blocks that do not have host/rack information,
                    // assign them to the default rack.
                    String[] racks = null;
                    if (oneblock.hosts.length == 0) {
                        racks = new String[] { NetworkTopology.DEFAULT_RACK };
                    } else {
                        racks = oneblock.racks;
                    }

                    // add this block to the rack --> block map
                    for (int j = 0; j < racks.length; j++) {
                        String rack = racks[j];
                        List<OneBlockInfo> blklist = rackToBlocks.get(rack);
                        if (blklist == null) {
                            blklist = new ArrayList<OneBlockInfo>();
                            rackToBlocks.put(rack, blklist);
                        }
                        blklist.add(oneblock);
                        if (!racks[j].equals(NetworkTopology.DEFAULT_RACK)) {
                            // Add this host to rackToNodes map
                            addHostToRack(rackToNodes, racks[j], oneblock.hosts[j]);
                        }
                    }

                    // add this block to the node --> block map
                    for (int j = 0; j < oneblock.hosts.length; j++) {
                        String node = oneblock.hosts[j];
                        List<OneBlockInfo> blklist = nodeToBlocks.get(node);
                        if (blklist == null) {
                            blklist = new ArrayList<OneBlockInfo>();
                            nodeToBlocks.put(node, blklist);
                        }
                        blklist.add(oneblock);
                    }
                }
            }
        }

        long getLength() {
            return fileSize;
        }

        OneBlockInfo[] getBlocks() {
            return blocks;
        }
    }

    /**
     * information about one block from the File System
     */
    private static class OneBlockInfo {
        Path onepath; // file this block belongs to
        long[] segOffset; // offsets of the sampled segments within the file
        long[] segLength; // lengths of the sampled segments
        String[] segWeights; // weights of the sampled segments
        String[] segKeys; // keys of the sampled segments
        long length; // total length of the sampled segments in this block
        String[] hosts; // nodes on which this block resides
        String[] racks; // network topology of hosts

        OneBlockInfo(Path path, long[] offset, long[] len, String[] keys, String[] weights, String[] hosts,
                String[] topologyPaths) {
            this.onepath = path;
            this.segOffset = offset;
            this.hosts = hosts;
            this.segLength = len;
            this.segWeights = weights;
            this.segKeys = keys;
            this.length = 0;
            for (long onelen : len) {
                this.length += onelen;
            }
            assert (hosts.length == topologyPaths.length || topologyPaths.length == 0);

            // if the file system does not have any rack information, then
            // use dummy rack location.
            if (topologyPaths.length == 0) {
                topologyPaths = new String[hosts.length];
                for (int i = 0; i < topologyPaths.length; i++) {
                    topologyPaths[i] = (new NodeBase(hosts[i], NetworkTopology.DEFAULT_RACK)).toString();
                }
            }

            // The topology paths have the host name included as the last 
            // component. Strip it.
            this.racks = new String[topologyPaths.length];
            for (int i = 0; i < topologyPaths.length; i++) {
                this.racks[i] = (new NodeBase(topologyPaths[i])).getNetworkLocation();
            }
        }
    }

    protected BlockLocation[] getFileBlockLocations(FileSystem fs, FileStatus stat) throws IOException {
        return fs.getFileBlockLocations(stat, 0, stat.getLen());
    }

    private static void addHostToRack(HashMap<String, Set<String>> rackToNodes, String rack, String host) {
        Set<String> hosts = rackToNodes.get(rack);
        if (hosts == null) {
            hosts = new HashSet<String>();
            rackToNodes.put(rack, hosts);
        }
        hosts.add(host);
    }

    private Set<String> getHosts(Set<String> racks) {
        Set<String> hosts = new HashSet<String>();
        for (String rack : racks) {
            if (rackToNodes.containsKey(rack)) {
                hosts.addAll(rackToNodes.get(rack));
            }
        }
        return hosts;
    }
}
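
Example configuration

The split sizes read in getSplits() come from the job configuration (or from the protected setters above). Below is a minimal, hypothetical driver sketch showing how those keys could be set; the driver class, job name, and the commented-out concrete input format subclass are illustrative assumptions, not part of this source.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.approx.lib.input.SampleTextInputFormat;

public class SampleJobDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Upper bound on a combined split; getSplits() falls back to 64 MB when unset.
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 64L * 1024 * 1024);

        // Leftover blocks on a node (or rack) only form their own split once they
        // exceed these minimums; otherwise they are combined at the next level.
        conf.setLong(SampleTextInputFormat.SPLIT_MINSIZE_PERNODE, 16L * 1024 * 1024);
        conf.setLong(SampleTextInputFormat.SPLIT_MINSIZE_PERRACK, 32L * 1024 * 1024);

        // When true, every sampled block becomes its own split (see addCreatedSplit1).
        conf.setBoolean("map.input.block.unit", false);

        Job job = Job.getInstance(conf, "approximate sampling job"); // name is illustrative
        // job.setInputFormatClass(MySampleTextInputFormat.class); // hypothetical concrete subclass
        // ... remaining job wiring (mapper, reducer, input/output paths) omitted
    }
}

A concrete subclass would also have to implement createRecordReader so that the returned reader can consume the sampled segments carried by each SampleFileSplit.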