com.hadoop.mapred.DeprecatedLzoTextInputFormat.java Source code

Introduction

Here is the source code for com.hadoop.mapred.DeprecatedLzoTextInputFormat.java
Source

/*
 * This file is part of Hadoop-Gpl-Compression.
 *
 * Hadoop-Gpl-Compression is free software: you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Hadoop-Gpl-Compression is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Hadoop-Gpl-Compression.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

package com.hadoop.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.hadoop.compression.lzo.LzoIndex;
import com.hadoop.compression.lzo.LzoInputFormatCommon;
import com.hadoop.compression.lzo.LzopCodec;

/**
 * This class conforms to the old (org.apache.hadoop.mapred.*) hadoop API style
 * which is deprecated but still required in places.  Streaming, for example,
 * does a check that the given input format is a descendant of
 * org.apache.hadoop.mapred.InputFormat, which any InputFormat-derived class
 * from the new API fails.  In order for streaming to work, you must use
 * com.hadoop.mapred.DeprecatedLzoTextInputFormat, not
 * com.hadoop.mapreduce.LzoTextInputFormat.  The classes attempt to be alike in
 * every other respect.
 * <p>
 * Note that to use this input format properly with hadoop-streaming, you should
 * also set the property <code>stream.map.input.ignoreKey=true</code>. That will
 * replicate the behavior of the default TextInputFormat by stripping off the byte
 * offset keys from the input lines that get piped to the mapper process.
 * <p>
 * See {@link LzoInputFormatCommon} for a description of the boolean property
 * <code>lzo.text.input.format.ignore.nonlzo</code> and how it affects the
 * behavior of this input format.
 * <p>
 * Implementation note: {@link #listStatus(JobConf)} reads the index of every LZO
 * input file and caches it in this instance; {@link #isSplitable(FileSystem, Path)}
 * and {@link #getSplits(JobConf, int)} consult that cache.
 */
@SuppressWarnings("deprecation")
public class DeprecatedLzoTextInputFormat extends TextInputFormat {
    /** LZO indexes read by {@link #listStatus(JobConf)}, keyed by the path of the LZO file. */
    private final Map<Path, LzoIndex> indexes = new HashMap<Path, LzoIndex>();

    /**
     * Lists the input files, filters out the ones that must not be read, and
     * caches the index of every LZO file that is kept.
     *
     * @param conf the job configuration
     * @return the LZO files, plus the non-LZO files when the
     *         <code>lzo.text.input.format.ignore.nonlzo</code> property says to keep
     *         them; ".lzo.index" files are never returned
     * @throws IOException if listing the input or reading an index fails
     */
    @Override
    protected FileStatus[] listStatus(JobConf conf) throws IOException {
        List<FileStatus> files = new ArrayList<FileStatus>(Arrays.asList(super.listStatus(conf)));

        boolean ignoreNonLzo = LzoInputFormatCommon.getIgnoreNonLzoProperty(conf);

        Iterator<FileStatus> it = files.iterator();
        while (it.hasNext()) {
            FileStatus fileStatus = it.next();
            Path file = fileStatus.getPath();

            if (!LzoInputFormatCommon.isLzoFile(file.toString())) {
                // Get rid of non-LZO files, unless the conf explicitly tells us to
                // keep them.
                // However, always skip over files that end with ".lzo.index", since
                // they are not part of the input.
                if (ignoreNonLzo || LzoInputFormatCommon.isLzoIndexFile(file.toString())) {
                    it.remove();
                }
            } else {
                // Remember the index so that isSplitable() and getSplits() can use it
                // without reading it again.
                FileSystem fs = file.getFileSystem(conf);
                LzoIndex index = LzoIndex.readIndex(fs, file);
                indexes.put(file, index);
            }
        }

        return files.toArray(new FileStatus[files.size()]);
    }

    /**
     * Tells whether the given file may be cut into several splits.
     *
     * @param fs the file system the file lives on (only used for non-LZO files)
     * @param filename the file to check
     * @return for an LZO file, <code>true</code> only if a non-empty index was
     *         cached for it by {@link #listStatus(JobConf)}; for any other file,
     *         whatever the TextInputFormat base class answers
     */
    @Override
    protected boolean isSplitable(FileSystem fs, Path filename) {
        if (LzoInputFormatCommon.isLzoFile(filename.toString())) {
            LzoIndex index = indexes.get(filename);
            // No cached index (listStatus() has not seen this path) is treated like
            // an empty one: without block offsets the file has to be read as a whole.
            return index != null && !index.isEmpty();
        } else {
            // Delegate non-LZO files to the TextInputFormat base class.
            return super.isSplitable(fs, filename);
        }
    }

    /**
     * Computes the input splits and moves the boundaries of the splits of indexed
     * LZO files so that they align with the LZO blocks recorded in the index.
     * Splits of non-LZO files and of LZO files with an empty index are returned
     * unchanged.
     *
     * @param conf the job configuration
     * @param numSplits hint for the number of splits, passed to the base class
     * @return the adjusted splits
     * @throws IOException if the base class fails, if no index was cached for an
     *         LZO file, or if the length of a file cannot be determined
     */
    @Override
    public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
        FileSplit[] splits = (FileSplit[]) super.getSplits(conf, numSplits);

        List<FileSplit> result = new ArrayList<FileSplit>(splits.length);
        // Length of each LZO file seen so far. A file usually contributes many
        // splits, so this avoids asking the file system for its status per split.
        Map<Path, Long> fileLengths = new HashMap<Path, Long>();

        for (FileSplit fileSplit : splits) {
            Path file = fileSplit.getPath();

            if (!LzoInputFormatCommon.isLzoFile(file.toString())) {
                // non-LZO file, keep the input split as is.
                result.add(fileSplit);
                continue;
            }

            // LZO file, try to split if the .index file was found
            LzoIndex index = indexes.get(file);
            if (index == null) {
                throw new IOException("Index not found for " + file);
            }
            if (index.isEmpty()) {
                // Empty index, keep it as is.
                result.add(fileSplit);
                continue;
            }

            Long fileLength = fileLengths.get(file);
            if (fileLength == null) {
                FileSystem fs = file.getFileSystem(conf);
                fileLength = Long.valueOf(fs.getFileStatus(file).getLen());
                fileLengths.put(file, fileLength);
            }

            // Find new start/end of the filesplit that align with the LZO blocks.
            long start = fileSplit.getStart();
            long end = start + fileSplit.getLength();

            long lzoStart = index.alignSliceStartToIndex(start, end);
            long lzoEnd = index.alignSliceEndToIndex(end, fileLength.longValue());

            // A split for which either boundary cannot be aligned is dropped
            // (presumably no LZO block starts inside it, so its data is covered
            // by a neighbouring split -- confirm against LzoIndex).
            if (lzoStart != LzoIndex.NOT_FOUND && lzoEnd != LzoIndex.NOT_FOUND) {
                result.add(new FileSplit(file, lzoStart, lzoEnd - lzoStart, fileSplit.getLocations()));
            }
        }

        return result.toArray(new FileSplit[result.size()]);
    }

    /**
     * Creates the record reader for a split.
     *
     * @param split the split to read; must be a {@link FileSplit}
     * @param conf the job configuration
     * @param reporter receives the split description as status for LZO files
     * @return a {@link DeprecatedLzoLineRecordReader} for an LZO file, otherwise
     *         the reader provided by the TextInputFormat base class
     * @throws IOException if the reader cannot be created
     */
    @Override
    public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
            throws IOException {
        FileSplit fileSplit = (FileSplit) split;
        if (LzoInputFormatCommon.isLzoFile(fileSplit.getPath().toString())) {
            reporter.setStatus(split.toString());
            return new DeprecatedLzoLineRecordReader(conf, fileSplit);
        } else {
            // delegate non-LZO files to the TextInputFormat base class.
            return super.getRecordReader(split, conf, reporter);
        }
    }
}