org.apache.hadoop.mapreduce.lib.input.FileInputFormat.java Source code

Introduction

Here is the source code for org.apache.hadoop.mapreduce.lib.input.FileInputFormat.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce.lib.input;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import java.util.concurrent.TimeUnit;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.mapred.LocatedFileStatusFetcher;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StopWatch;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;

/** 
 * A base class for file-based {@link InputFormat}s.
 *
 * <p><code>FileInputFormat</code> is the base class for all file-based 
 * <code>InputFormat</code>s. This provides a generic implementation of
 * {@link #getSplits(JobContext)}.
 *
 * Implementations of <code>FileInputFormat</code> can also override the
 * {@link #isSplitable(JobContext, Path)} method to prevent input files
 * from being split-up in certain situations. Implementations that may
 * deal with non-splittable files <i>must</i> override this method, since
 * the default implementation assumes splitting is always possible.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class FileInputFormat<K, V> extends InputFormat<K, V> {
    public static final String INPUT_DIR = "mapreduce.input.fileinputformat.inputdir";
    public static final String SPLIT_MAXSIZE = "mapreduce.input.fileinputformat.split.maxsize";
    public static final String SPLIT_MINSIZE = "mapreduce.input.fileinputformat.split.minsize";
    public static final String PATHFILTER_CLASS = "mapreduce.input.pathFilter.class";
    public static final String NUM_INPUT_FILES = "mapreduce.input.fileinputformat.numinputfiles";
    public static final String INPUT_DIR_RECURSIVE = "mapreduce.input.fileinputformat.input.dir.recursive";
    public static final String INPUT_DIR_NONRECURSIVE_IGNORE_SUBDIRS = "mapreduce.input.fileinputformat.input.dir.nonrecursive.ignore.subdirs";
    public static final String LIST_STATUS_NUM_THREADS = "mapreduce.input.fileinputformat.list-status.num-threads";
    public static final int DEFAULT_LIST_STATUS_NUM_THREADS = 1;

    private static final Logger LOG = LoggerFactory.getLogger(FileInputFormat.class);

    private static final double SPLIT_SLOP = 1.1; // 10% slop

    @Deprecated
    public enum Counter {
        BYTES_READ
    }

    private static final PathFilter hiddenFileFilter = new PathFilter() {
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

    /**
     * Proxy PathFilter that accepts a path only if all filters given in the
     * constructor do. Used by the listPaths() to apply the built-in
     * hiddenFileFilter together with a user provided one (if any).
     */
    private static class MultiPathFilter implements PathFilter {
        private List<PathFilter> filters;

        public MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        public boolean accept(Path path) {
            for (PathFilter filter : filters) {
                if (!filter.accept(path)) {
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * @param job
     *          the job to modify
     * @param inputDirRecursive
     */
    public static void setInputDirRecursive(Job job, boolean inputDirRecursive) {
        job.getConfiguration().setBoolean(INPUT_DIR_RECURSIVE, inputDirRecursive);
    }

    /**
     * @param job
     *          the job to look at.
     * @return should the files to be read recursively?
     */
    public static boolean getInputDirRecursive(JobContext job) {
        return job.getConfiguration().getBoolean(INPUT_DIR_RECURSIVE, false);
    }

    /**
     * Get the lower bound on split size imposed by the format.
     * @return the number of bytes of the minimal split for this format
     */
    protected long getFormatMinSplitSize() {
        return 1;
    }

    /**
     * Is the given filename splittable? Usually, true, but if the file is
     * stream compressed, it will not be.
     *
     * The default implementation in <code>FileInputFormat</code> always returns
     * true. Implementations that may deal with non-splittable files <i>must</i>
     * override this method.
     *
     * <code>FileInputFormat</code> implementations can override this and return
     * <code>false</code> to ensure that individual input files are never split-up
     * so that {@link Mapper}s process entire files.
     * 
     * @param context the job context
     * @param filename the file name to check
     * @return is this file splitable?
     */
    protected boolean isSplitable(JobContext context, Path filename) {
        return true;
    }

    /**
     * Set a PathFilter to be applied to the input paths for the map-reduce job.
     * @param job the job to modify
     * @param filter the PathFilter class use for filtering the input paths.
     */
    public static void setInputPathFilter(Job job, Class<? extends PathFilter> filter) {
        job.getConfiguration().setClass(PATHFILTER_CLASS, filter, PathFilter.class);
    }

    /**
     * Set the minimum input split size
     * @param job the job to modify
     * @param size the minimum size
     */
    public static void setMinInputSplitSize(Job job, long size) {
        job.getConfiguration().setLong(SPLIT_MINSIZE, size);
    }

    /**
     * Get the minimum split size
     * @param job the job
     * @return the minimum number of bytes that can be in a split
     */
    public static long getMinSplitSize(JobContext job) {
        return job.getConfiguration().getLong(SPLIT_MINSIZE, 1L);
    }

    /**
     * Set the maximum split size
     * @param job the job to modify
     * @param size the maximum split size
     */
    public static void setMaxInputSplitSize(Job job, long size) {
        job.getConfiguration().setLong(SPLIT_MAXSIZE, size);
    }

    /**
     * Get the maximum split size.
     * @param context the job to look at.
     * @return the maximum number of bytes a split can include
     */
    public static long getMaxSplitSize(JobContext context) {
        return context.getConfiguration().getLong(SPLIT_MAXSIZE, Long.MAX_VALUE);
    }

    /**
     * Get a PathFilter instance of the filter set for the input paths.
     *
     * @return the PathFilter instance set for the job, NULL if none has been set.
     */
    public static PathFilter getInputPathFilter(JobContext context) {
        Configuration conf = context.getConfiguration();
        Class<?> filterClass = conf.getClass(PATHFILTER_CLASS, null, PathFilter.class);
        return (filterClass != null) ? (PathFilter) ReflectionUtils.newInstance(filterClass, conf) : null;
    }

    /** List input directories.
     * Subclasses may override to, e.g., select only files matching a regular
     * expression. 
     * 
     * @param job the job to list input paths for
     * @return array of FileStatus objects
     * @throws IOException if zero items.
     */
    protected List<FileStatus> listStatus(JobContext job) throws IOException {
        Path[] dirs = getInputPaths(job);
        if (dirs.length == 0) {
            throw new IOException("No input paths specified in job");
        }

        // get tokens for all the required FileSystems..
        TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

        // Whether we need to recursive look into the directory structure
        boolean recursive = getInputDirRecursive(job);

        // creates a MultiPathFilter with the hiddenFileFilter and the
        // user provided one (if any).
        List<PathFilter> filters = new ArrayList<PathFilter>();
        filters.add(hiddenFileFilter);
        PathFilter jobFilter = getInputPathFilter(job);
        if (jobFilter != null) {
            filters.add(jobFilter);
        }
        PathFilter inputFilter = new MultiPathFilter(filters);

        List<FileStatus> result = null;

        int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
        StopWatch sw = new StopWatch().start();
        if (numThreads == 1) {
            result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
        } else {
            Iterable<FileStatus> locatedFiles = null;
            try {
                LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(
                        job.getConfiguration(), dirs, recursive, inputFilter, true);
                locatedFiles = locatedFileStatusFetcher.getFileStatuses();
            } catch (InterruptedException e) {
                throw new IOException("Interrupted while getting file statuses");
            }
            result = Lists.newArrayList(locatedFiles);
        }

        sw.stop();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Time taken to get FileStatuses: " + sw.now(TimeUnit.MILLISECONDS));
        }
        LOG.info("Total input files to process : " + result.size());
        return result;
    }

    private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
            boolean recursive) throws IOException {
        List<FileStatus> result = new ArrayList<FileStatus>();
        List<IOException> errors = new ArrayList<IOException>();
        for (int i = 0; i < dirs.length; ++i) {
            Path p = dirs[i];
            FileSystem fs = p.getFileSystem(job.getConfiguration());
            FileStatus[] matches = fs.globStatus(p, inputFilter);
            if (matches == null) {
                errors.add(new IOException("Input path does not exist: " + p));
            } else if (matches.length == 0) {
                errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
            } else {
                for (FileStatus globStat : matches) {
                    if (globStat.isDirectory()) {
                        RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
                        while (iter.hasNext()) {
                            LocatedFileStatus stat = iter.next();
                            if (inputFilter.accept(stat.getPath())) {
                                if (recursive && stat.isDirectory()) {
                                    addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                                } else {
                                    result.add(stat);
                                }
                            }
                        }
                    } else {
                        result.add(globStat);
                    }
                }
            }
        }

        if (!errors.isEmpty()) {
            throw new InvalidInputException(errors);
        }
        return result;
    }

    /**
     * Add files in the input path recursively into the results.
     * @param result
     *          The List to store all files.
     * @param fs
     *          The FileSystem.
     * @param path
     *          The input path.
     * @param inputFilter
     *          The input filter that can be used to filter files/dirs. 
     * @throws IOException
     */
    protected void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
            PathFilter inputFilter) throws IOException {
        RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(path);
        while (iter.hasNext()) {
            LocatedFileStatus stat = iter.next();
            if (inputFilter.accept(stat.getPath())) {
                if (stat.isDirectory()) {
                    addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                } else {
                    result.add(stat);
                }
            }
        }
    }

    /**
     * A factory that makes the split for this class. It can be overridden
     * by sub-classes to make sub-types
     */
    protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
        return new FileSplit(file, start, length, hosts);
    }

    /**
     * A factory that makes the split for this class. It can be overridden
     * by sub-classes to make sub-types
     */
    protected FileSplit makeSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) {
        return new FileSplit(file, start, length, hosts, inMemoryHosts);
    }

    /** 
     * Generate the list of files and make them into FileSplits.
     * @param job the job context
     * @throws IOException
     */
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        StopWatch sw = new StopWatch().start();
        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);

        // generate splits
        List<InputSplit> splits = new ArrayList<InputSplit>();
        List<FileStatus> files = listStatus(job);

        boolean ignoreDirs = !getInputDirRecursive(job)
                && job.getConfiguration().getBoolean(INPUT_DIR_NONRECURSIVE_IGNORE_SUBDIRS, false);
        for (FileStatus file : files) {
            if (ignoreDirs && file.isDirectory()) {
                continue;
            }
            Path path = file.getPath();
            long length = file.getLen();
            if (length != 0) {
                BlockLocation[] blkLocations;
                if (file instanceof LocatedFileStatus) {
                    blkLocations = ((LocatedFileStatus) file).getBlockLocations();
                } else {
                    FileSystem fs = path.getFileSystem(job.getConfiguration());
                    blkLocations = fs.getFileBlockLocations(file, 0, length);
                }
                if (isSplitable(job, path)) {
                    long blockSize = file.getBlockSize();
                    long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                    long bytesRemaining = length;
                    while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                        int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                        splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                                blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                        bytesRemaining -= splitSize;
                    }

                    if (bytesRemaining != 0) {
                        int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                        splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                                blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    }
                } else { // not splitable
                    if (LOG.isDebugEnabled()) {
                        // Log only if the file is big enough to be splitted
                        if (length > Math.min(file.getBlockSize(), minSize)) {
                            LOG.debug("File is not splittable so no parallelization " + "is possible: "
                                    + file.getPath());
                        }
                    }
                    splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                            blkLocations[0].getCachedHosts()));
                }
            } else {
                //Create empty hosts array for zero length files
                splits.add(makeSplit(path, 0, length, new String[0]));
            }
        }
        // Save the number of input files for metrics/loadgen
        job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
        sw.stop();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: "
                    + sw.now(TimeUnit.MILLISECONDS));
        }
        return splits;
    }

    protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    protected int getBlockIndex(BlockLocation[] blkLocations, long offset) {
        for (int i = 0; i < blkLocations.length; i++) {
            // is the offset inside this block?
            if ((blkLocations[i].getOffset() <= offset)
                    && (offset < blkLocations[i].getOffset() + blkLocations[i].getLength())) {
                return i;
            }
        }
        BlockLocation last = blkLocations[blkLocations.length - 1];
        long fileLength = last.getOffset() + last.getLength() - 1;
        throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")");
    }

    /**
     * Sets the given comma separated paths as the list of inputs 
     * for the map-reduce job.
     * 
     * @param job the job
     * @param commaSeparatedPaths Comma separated paths to be set as 
     *        the list of inputs for the map-reduce job.
     */
    public static void setInputPaths(Job job, String commaSeparatedPaths) throws IOException {
        setInputPaths(job, StringUtils.stringToPath(getPathStrings(commaSeparatedPaths)));
    }

    /**
     * Add the given comma separated paths to the list of inputs for
     *  the map-reduce job.
     * 
     * @param job The job to modify
     * @param commaSeparatedPaths Comma separated paths to be added to
     *        the list of inputs for the map-reduce job.
     */
    public static void addInputPaths(Job job, String commaSeparatedPaths) throws IOException {
        for (String str : getPathStrings(commaSeparatedPaths)) {
            addInputPath(job, new Path(str));
        }
    }

    /**
     * Set the array of {@link Path}s as the list of inputs
     * for the map-reduce job.
     * 
     * @param job The job to modify 
     * @param inputPaths the {@link Path}s of the input directories/files 
     * for the map-reduce job.
     */
    public static void setInputPaths(Job job, Path... inputPaths) throws IOException {
        Configuration conf = job.getConfiguration();
        Path path = inputPaths[0].getFileSystem(conf).makeQualified(inputPaths[0]);
        StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString()));
        for (int i = 1; i < inputPaths.length; i++) {
            str.append(StringUtils.COMMA_STR);
            path = inputPaths[i].getFileSystem(conf).makeQualified(inputPaths[i]);
            str.append(StringUtils.escapeString(path.toString()));
        }
        conf.set(INPUT_DIR, str.toString());
    }

    /**
     * Add a {@link Path} to the list of inputs for the map-reduce job.
     * 
     * @param job The {@link Job} to modify
     * @param path {@link Path} to be added to the list of inputs for 
     *            the map-reduce job.
     */
    public static void addInputPath(Job job, Path path) throws IOException {
        Configuration conf = job.getConfiguration();
        path = path.getFileSystem(conf).makeQualified(path);
        String dirStr = StringUtils.escapeString(path.toString());
        String dirs = conf.get(INPUT_DIR);
        conf.set(INPUT_DIR, dirs == null ? dirStr : dirs + "," + dirStr);
    }

    // This method escapes commas in the glob pattern of the given paths.
    private static String[] getPathStrings(String commaSeparatedPaths) {
        int length = commaSeparatedPaths.length();
        int curlyOpen = 0;
        int pathStart = 0;
        boolean globPattern = false;
        List<String> pathStrings = new ArrayList<String>();

        for (int i = 0; i < length; i++) {
            char ch = commaSeparatedPaths.charAt(i);
            switch (ch) {
            case '{': {
                curlyOpen++;
                if (!globPattern) {
                    globPattern = true;
                }
                break;
            }
            case '}': {
                curlyOpen--;
                if (curlyOpen == 0 && globPattern) {
                    globPattern = false;
                }
                break;
            }
            case ',': {
                if (!globPattern) {
                    pathStrings.add(commaSeparatedPaths.substring(pathStart, i));
                    pathStart = i + 1;
                }
                break;
            }
            default:
                continue; // nothing special to do for this character
            }
        }
        pathStrings.add(commaSeparatedPaths.substring(pathStart, length));

        return pathStrings.toArray(new String[0]);
    }

    /**
     * Get the list of input {@link Path}s for the map-reduce job.
     * 
     * @param context The job
     * @return the list of input {@link Path}s for the map-reduce job.
     */
    public static Path[] getInputPaths(JobContext context) {
        String dirs = context.getConfiguration().get(INPUT_DIR, "");
        String[] list = StringUtils.split(dirs);
        Path[] result = new Path[list.length];
        for (int i = 0; i < list.length; i++) {
            result[i] = new Path(StringUtils.unEscapeString(list[i]));
        }
        return result;
    }

}