com.ning.metrics.serialization.hadoop.SmileInputFormat.java Source code

Introduction

Here is the source code for com.ning.metrics.serialization.hadoop.SmileInputFormat.java
Source

/*
 * Copyright 2010-2011 Ning, Inc.
 *
 * Ning licenses this file to you under the Apache License, version 2.0
 * (the "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at:
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

package com.ning.metrics.serialization.hadoop;

import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.InvalidInputException;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.util.StringUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

/**
 * Hadoop {@link InputFormat} for files whose name ends with {@code .smile}.
 * <p>
 * Every matching file becomes exactly one {@link FileSplit} covering the whole
 * file (files are not split), and records are read with a {@code SmileRecordReader}.
 * <p>
 * NOTE(review): the class extends the raw {@link InputFormat} type because the
 * key/value types produced by {@code SmileRecordReader} are declared elsewhere.
 */
public class SmileInputFormat extends InputFormat {
    /** Shared, stateless filter accepting only paths whose name ends with ".smile". */
    private static final PathFilter SMILE_FILTER = new SmilePathFilter();

    /**
     * Get the list of input {@link Path}s for the map-reduce job.
     * <p>
     * The paths are read from the {@code mapred.input.dir} configuration entry,
     * split with {@link StringUtils#split(String)} and un-escaped.
     *
     * @param context The job
     * @return the list of input {@link Path}s for the map-reduce job; empty if the entry is unset.
     */
    public static Path[] getInputPaths(JobContext context) {
        String dirs = context.getConfiguration().get("mapred.input.dir", "");
        String[] list = StringUtils.split(dirs);
        Path[] result = new Path[list.length];
        for (int i = 0; i < list.length; i++) {
            result[i] = new Path(StringUtils.unEscapeString(list[i]));
        }
        return result;
    }

    /**
     * List the Smile files designated by the job's input paths.
     * <p>
     * Each input path is expanded as a glob. A matched directory contributes its
     * direct children whose name ends with ".smile"; a matched plain file is kept
     * only if its own name ends with ".smile".
     *
     * @param job the job to list input paths for
     * @return the {@link FileStatus} of every Smile file found
     * @throws IOException           if the job has no input path configured
     * @throws InvalidInputException if an input path does not exist, or matches neither
     *                               a directory nor a Smile file
     */
    protected List<FileStatus> listStatus(JobContext job) throws IOException {
        List<FileStatus> result = new ArrayList<FileStatus>();
        Path[] dirs = getInputPaths(job);
        if (dirs.length == 0) {
            throw new IOException("No input paths specified in job");
        }

        // Get tokens for all the required FileSystems..
        TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

        // Collect the problems of all input paths so they are reported together
        List<IOException> errors = new ArrayList<IOException>();
        for (Path p : dirs) {
            FileSystem fs = p.getFileSystem(job.getConfiguration());
            // Glob WITHOUT the Smile filter: the filter is name-based and would
            // otherwise reject plain input directories (whose name does not end
            // with ".smile") before their content can be listed.
            FileStatus[] matches = fs.globStatus(p);
            if (matches == null) {
                errors.add(new IOException("Input path does not exist: " + p));
                continue;
            }

            boolean matched = false;
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    matched = true;
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), SMILE_FILTER));
                } else if (SMILE_FILTER.accept(globStat.getPath())) {
                    matched = true;
                    result.add(globStat);
                }
            }

            // Nothing usable: empty glob expansion, or only non-Smile plain files
            if (!matched) {
                errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
            }
        }

        if (!errors.isEmpty()) {
            throw new InvalidInputException(errors);
        }

        return result;
    }

    /**
     * Build one {@link FileSplit} per input file, spanning the whole file.
     * <p>
     * The split's locations are the distinct hosts of all the blocks of the file,
     * in the order they are first reported.
     *
     * @param jobContext the job to compute the splits for
     * @return one split per file returned by {@link #listStatus(JobContext)}
     * @throws IOException          if the input files or their block locations cannot be listed
     * @throws InterruptedException declared by {@link InputFormat#getSplits(JobContext)}
     */
    @Override
    public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
        final List<InputSplit> splits = new ArrayList<InputSplit>();
        final List<FileStatus> files = listStatus(jobContext);
        for (FileStatus file : files) {
            final Path path = file.getPath();
            final FileSystem fs = path.getFileSystem(jobContext.getConfiguration());
            final BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());

            // A host usually stores several blocks of the same file: keep each host
            // once (insertion-ordered) instead of repeating it for every block.
            final Set<String> blkHosts = new LinkedHashSet<String>();
            for (final BlockLocation location : blkLocations) {
                blkHosts.addAll(Arrays.asList(location.getHosts()));
            }

            // TODO Split files =)
            final String[] hosts = blkHosts.toArray(new String[0]);
            splits.add(new FileSplit(path, 0, file.getLen(), hosts));
        }

        return splits;
    }

    /**
     * Create the reader for a split; also updates the task status message.
     *
     * @param inputSplit         the split to read (not used here)
     * @param taskAttemptContext the task attempt whose status is updated
     * @return a new {@code SmileRecordReader}
     * @throws IOException          declared by {@link InputFormat}
     * @throws InterruptedException declared by {@link InputFormat}
     */
    @Override
    public RecordReader createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
            throws IOException, InterruptedException {
        taskAttemptContext.setStatus("Creating a SmileRecordReader");
        return new SmileRecordReader();
    }

    /**
     * Accepts paths whose last component ends with ".smile".
     * Static: the filter is stateless and needs no reference to the enclosing instance.
     */
    private static final class SmilePathFilter implements PathFilter {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(".smile");
        }
    }
}