Java tutorial: cascading.tap.hadoop.ZipInputFormat, a Hadoop InputFormat for reading ZIP files
/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap.hadoop;

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

/**
 * Class ZipInputFormat is an {@link InputFormat} for zip files. Each file within a zip file is broken
 * into lines. Either line-feed or carriage-return are used to signal end of
 * line. Keys are the position in the file, and values are the line of text.
 * <p/>
 * If the underlying {@link FileSystem} is HDFS or FILE, each {@link ZipEntry} is returned
 * as a unique split. Otherwise this input format returns false for isSplitable, and will
 * subsequently iterate over each ZipEntry and treat all internal files as the 'same' file.
 */
public class ZipInputFormat extends FileInputFormat<LongWritable, Text> implements JobConfigurable {

  public void configure(JobConf conf) {
  }

  /**
   * Return true only if the file is in ZIP format.
   *
   * @param fs   the file system that the file is on
   * @param file the path that represents this file
   * @return is this file splitable?
   */
  protected boolean isSplitable(FileSystem fs, Path file) {
    if (!isAllowSplits(fs))
      return false;

    if (LOG.isDebugEnabled())
      LOG.debug("verifying ZIP format for file: " + file.toString());

    boolean splitable = true;
    ZipInputStream zipInputStream = null;

    try {
      zipInputStream = new ZipInputStream(fs.open(file));
      ZipEntry zipEntry = zipInputStream.getNextEntry();

      if (zipEntry == null)
        throw new IOException("no entries found, empty zip file");

      if (LOG.isDebugEnabled())
        LOG.debug("ZIP format verification successful");
    } catch (IOException exception) {
      LOG.error("exception encountered while trying to open and read ZIP input stream", exception);
      splitable = false;
    } finally {
      safeClose(zipInputStream);
    }

    return splitable;
  }

  protected Path[] listPathsInternal(JobConf jobConf) throws IOException {
    Path[] dirs = FileInputFormat.getInputPaths(jobConf);

    if (dirs.length == 0)
      throw new IOException("no input paths specified in job");

    for (Path dir : dirs) {
      FileSystem fs = dir.getFileSystem(jobConf);

      if (!fs.isFile(dir))
        throw new IOException("does not support directories: " + dir);
    }

    return dirs;
  }

  @Override
  protected FileStatus[] listStatus(JobConf jobConf) throws IOException {
    Path[] paths = listPathsInternal(jobConf);
    FileStatus[] statuses = new FileStatus[paths.length];

    for (int i = 0; i < paths.length; i++) {
      Path path = paths[i];
      statuses[i] = path.getFileSystem(jobConf).getFileStatus(path);
    }

    return statuses;
  }

  /**
   * Splits files returned by {@link #listPathsInternal(JobConf)}. Each file is
   * expected to be in zip format and each split corresponds to
   * {@link ZipEntry}.
   *
   * @param job       the JobConf data structure, see {@link JobConf}
   * @param numSplits the number of splits required. Ignored here
   * @throws IOException if input files are not in zip format
   */
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    if (LOG.isDebugEnabled())
      LOG.debug("start splitting input ZIP files");

    Path[] files = listPathsInternal(job);

    for (int i = 0; i < files.length; i++) { // check we have valid files
      Path file = files[i];
      FileSystem fs = file.getFileSystem(job);

      if (!fs.isFile(file) || !fs.exists(file))
        throw new IOException("not a file: " + files[i]);
    }

    // generate splits
    ArrayList<ZipSplit> splits = new ArrayList<ZipSplit>(numSplits);

    for (int i = 0; i < files.length; i++) {
      Path file = files[i];
      FileSystem fs = file.getFileSystem(job);

      if (LOG.isDebugEnabled())
        LOG.debug("opening zip file: " + file.toString());

      if (isAllowSplits(fs))
        makeSplits(job, splits, fs, file);
      else
        makeSplit(job, splits, file);
    }

    if (LOG.isDebugEnabled())
      LOG.debug("end splitting input ZIP files");

    return splits.toArray(new ZipSplit[splits.size()]);
  }

  private void makeSplit(JobConf job, ArrayList<ZipSplit> splits, Path file) throws IOException {
    if (LOG.isDebugEnabled())
      LOG.debug("creating split for zip: " + file);

    // unknown uncompressed size. if set to compressed size, data will be truncated
    splits.add(new ZipSplit(file, -1));
  }

  private void makeSplits(JobConf job, ArrayList<ZipSplit> splits, FileSystem fs, Path file) throws IOException {
    ZipInputStream zipInputStream = new ZipInputStream(fs.open(file));

    try {
      ZipEntry zipEntry;

      while ((zipEntry = zipInputStream.getNextEntry()) != null) {
        ZipSplit zipSplit = new ZipSplit(file, zipEntry.getName(), zipEntry.getSize());

        if (LOG.isDebugEnabled())
          LOG.debug(String.format("creating split for zip entry: %s size: %d method: %s compressed size: %d",
            zipEntry.getName(), zipEntry.getSize(),
            ZipEntry.DEFLATED == zipEntry.getMethod() ? "DEFLATED" : "STORED",
            zipEntry.getCompressedSize()));

        splits.add(zipSplit);
      }
    } finally {
      safeClose(zipInputStream);
    }
  }

  public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit, JobConf job,
                                                          Reporter reporter) throws IOException {
    reporter.setStatus(genericSplit.toString());

    ZipSplit split = (ZipSplit) genericSplit;
    Path file = split.getPath();
    long length = split.getLength();

    // Set it max value if length is unknown. Setting length to Max value does not have
    // a side effect as Record reader would not be able to read past the actual size of
    // current entry.
    length = length == -1 ? Long.MAX_VALUE - 1 : length;

    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream inputStream = fs.open(file);

    if (isAllowSplits(fs))
      return getReaderForEntry(inputStream, split, length);
    else
      return getReaderForAll(inputStream);
  }

  private RecordReader<LongWritable, Text> getReaderForAll(final FSDataInputStream inputStream) throws IOException {
    final long bytesSize[] = new long[]{0};
    final long bytesRead[] = new long[]{0};

    Enumeration<InputStream> enumeration = new Enumeration<InputStream>() {
      boolean returnCurrent = false;
      ZipEntry nextEntry;
      ZipInputStream zipInputStream = new ZipInputStream(inputStream);
      InputStream closeableInputStream = makeInputStream(zipInputStream);

      public boolean hasMoreElements() {
        if (returnCurrent)
          return nextEntry != null;

        getNext();

        return nextEntry != null;
      }

      public InputStream nextElement() {
        if (returnCurrent) {
          returnCurrent = false;
          return closeableInputStream;
        }

        getNext();

        if (nextEntry == null)
          throw new IllegalStateException("no more zip entries in zip input stream");

        return closeableInputStream;
      }

      private void getNext() {
        try {
          nextEntry = zipInputStream.getNextEntry();

          while (nextEntry != null && nextEntry.isDirectory())
            nextEntry = zipInputStream.getNextEntry();

          if (nextEntry != null)
            bytesSize[0] += nextEntry.getSize();

          returnCurrent = true;
        } catch (IOException exception) {
          throw new RuntimeException("could not get next zip entry", exception);
        } finally {
          // i think, better than sending across a fake input stream that closes the zip
          if (nextEntry == null)
            safeClose(zipInputStream);
        }
      }

      private InputStream makeInputStream(ZipInputStream zipInputStream) {
        return new FilterInputStream(zipInputStream) {
          @Override
          public int read() throws IOException {
            bytesRead[0]++;
            return super.read();
          }

          @Override
          public int read(byte[] bytes) throws IOException {
            int result = super.read(bytes);
            bytesRead[0] += result;
            return result;
          }

          @Override
          public int read(byte[] bytes, int i, int i1) throws IOException {
            int result = super.read(bytes, i, i1);
            bytesRead[0] += result;
            return result;
          }

          @Override
          public long skip(long l) throws IOException {
            long result = super.skip(l);
            bytesRead[0] += result;
            return result;
          }

          @Override
          public void close() throws IOException {
            // do nothing
          }
        };
      }
    };

    return new LineRecordReader(new SequenceInputStream(enumeration), 0, Long.MAX_VALUE, Integer.MAX_VALUE) {
      @Override
      public float getProgress() {
        if (0 == bytesSize[0])
          return 0.0f;
        else
          return Math.min(1.0f, bytesRead[0] / (float) bytesSize[0]);
      }
    };
  }

  private RecordReader<LongWritable, Text> getReaderForEntry(FSDataInputStream inputStream, ZipSplit split,
                                                             long length) throws IOException {
    ZipInputStream zipInputStream = new ZipInputStream(inputStream);

    String entryPath = split.getEntryPath();
    ZipEntry zipEntry = zipInputStream.getNextEntry();

    while (zipEntry != null && !zipEntry.getName().equals(entryPath))
      zipEntry = zipInputStream.getNextEntry();

    return new LineRecordReader(zipInputStream, 0, length, Integer.MAX_VALUE);
  }

  protected boolean isAllowSplits(FileSystem fs) {
    // only allow if fs is local or dfs
    URI uri = fs.getUri();
    String scheme = uri.getScheme();

    return scheme.equalsIgnoreCase("hdfs") || scheme.equalsIgnoreCase("file");
  }

  private void safeClose(ZipInputStream zipInputStream) {
    try {
      if (zipInputStream != null)
        zipInputStream.close();
    } catch (IOException exception) {
      LOG.error("exception while trying to close ZIP input stream", exception);
    }
  }
}
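
For context, here is a minimal sketch of a classic (org.apache.hadoop.mapred) driver that plugs this input format into a map-only job. Only ZipInputFormat comes from the code above; the driver class name, job name, and the input and output paths are hypothetical placeholders, and the identity mapper is used purely for illustration. Note that the input path must point at the zip file itself, since listPathsInternal rejects directories.

package cascading.tap.hadoop;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class ZipLinesDriver { // hypothetical driver class, not part of Cascading

  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(ZipLinesDriver.class);
    job.setJobName("read zip entries as lines");

    // records are <position, line of text>, as described in the ZipInputFormat javadoc
    job.setInputFormat(ZipInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("input/archive.zip")); // placeholder path; must be a file, not a directory

    // pass each line straight through; no reduce phase
    job.setMapperClass(IdentityMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path("output/zip-lines")); // placeholder path

    JobClient.runJob(job);
  }
}

On HDFS or the local file system, ZipInputFormat produces one split per zip entry, so each entry can be read by a separate map task; on other file systems it falls back to a single split that streams every entry through one reader.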