reconcile.hbase.mapreduce.ZipInputFormat.java Source code

Introduction

Here is the source code for reconcile.hbase.mapreduce.ZipInputFormat.java
Source

/*
 * Copyright (c) 2008, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National
 * Laboratory. Written by Teresa Cottom, cottom1@llnl.gov CODE-400187 All rights reserved. This file is part of
 * RECONCILE
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
 * License (as published by the Free Software Foundation) version 2, dated June 1991. This program is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with this program; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA For full text see license.txt
 */
package reconcile.hbase.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.FileNameMap;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class ZipInputFormat extends FileInputFormat<Text, Text> {
    /** Configuration key: optional cap on the number of zip entries turned into splits (parsed as an integer). */
    public static final String MAX_ENTRY_FILES = "InputFormat.maxEntryFilesToProcess";
    /** Configuration key: number of zip entries grouped into one {@link MultiZipSplit}; read with a default of "1". */
    public static final String NUM_FILES_PER_SPLIT = "InputFormat.numberFilesPerSplit";
    /**
     * Configuration key: optional size threshold above which zip entries are skipped (parsed as an integer).
     * NOTE(review): the key name says megabytes; confirm the unit used where the threshold is compared.
     */
    public static final String IGNORE_FILES_LARGER_THAN_IN_MB = "InputFormat.ignoreFilesLargerThanInMB";
    /**
     * Configuration key: optional comma-separated list of mime types; when at least one is given, entries whose
     * mime type is not in the list are skipped.
     */
    public static final String PROCESS_MIME_TYPES_ONLY = "InputFormat.processMimeTypes";
    /** Buffer size, in bytes, for chunked reads of zip entry data. */
    public static final int bufSize = 1024;

    /** Logger shared by this input format and its nested classes. */
    static final Log LOG = LogFactory.getLog(ZipInputFormat.class);

    /**
     * An input split describing a single entry inside a zip archive.
     * <p>
     * Only the archive path, the entry path and the entry length are serialized by {@link #write(DataOutput)};
     * the {@link JobContext} is not, so an instance rebuilt through {@link #readFields(DataInput)} has no
     * context.
     */
    public static class ZipEntrySplit extends InputSplit implements Writable {
        private Path file;
        private String entryPath;
        private long length;
        // Only set by the four-argument constructor; stays null on instances rebuilt via readFields().
        private JobContext context;

        /** No-argument constructor used when the split is rebuilt via {@link #readFields(DataInput)}. */
        public ZipEntrySplit() {
        }

        /**
         * @param file      the zip archive containing the entry
         * @param entryPath the name of the entry within the archive
         * @param length    the entry size as reported by the archive (may be non-positive when unknown)
         * @param context   the job context used to look up block locations
         */
        public ZipEntrySplit(Path file, String entryPath, long length, JobContext context) {
            this.file = file;
            this.entryPath = entryPath;
            this.length = length;
            this.context = context;
            LOG.info(
                    "ZipEntrySplit: file(" + file.toString() + ") entry(" + entryPath + ") length(" + length + ")");
        }

        /** The zip archive containing this split's data. */
        public Path getFile() {
            return file;
        }

        /** The path of the file within the zip archive. */
        public String getEntryPath() {
            return entryPath;
        }

        /** The archive path and the entry path joined by "-"; used as the record key. */
        public String getFullEntryPath() {
            return file.toString() + "-" + entryPath;
        }

        /** The uncompressed size of the file within the zip archive. */
        @Override
        public long getLength() throws IOException {
            return length;
        }

        /** Serializes the archive path, the entry path and the length (the job context is not written). */
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(file.toString());
            out.writeUTF(entryPath);
            out.writeLong(length);
        }

        /** Restores the fields written by {@link #write(DataOutput)}. */
        @Override
        public void readFields(DataInput in) throws IOException {
            file = new Path(in.readUTF());
            entryPath = in.readUTF();
            length = in.readLong();
            LOG.info("ZipEntrySplit.readFields: file(" + file.toString() + ") entry(" + entryPath + ") length("
                    + length + ")");
        }

        /**
         * Returns the sorted, de-duplicated host names that store blocks of the zip archive.
         *
         * @return the host names, or an empty array when no job context is available (for example on an
         *         instance rebuilt through {@link #readFields(DataInput)})
         */
        @Override
        public String[] getLocations() throws IOException {
            if (context == null) {
                // Without a context there is no configuration to reach the file system with; report
                // "no locality information" instead of failing with a NullPointerException.
                return new String[0];
            }

            TreeSet<String> hosts = new TreeSet<String>();
            FileSystem fs = FileSystem.get(context.getConfiguration());
            FileStatus status = fs.getFileStatus(file);
            BlockLocation[] blocks = fs.getFileBlockLocations(status, 0L, status.getLen());
            if (blocks != null) {
                for (BlockLocation block : blocks) {
                    for (String name : block.getHosts()) {
                        hosts.add(name);
                    }
                }
            }
            return hosts.toArray(new String[hosts.size()]);
        }

    }

    /**
     * An input split that bundles several {@link ZipEntrySplit}s so that one task processes a group of zip
     * entries.
     */
    public static class MultiZipSplit extends InputSplit implements Writable {
        /** The bundled entry splits, in processing order. */
        public ZipEntrySplit[] splits;

        /** No-argument constructor used when the split is rebuilt via {@link #readFields(DataInput)}. */
        public MultiZipSplit() {
        }

        /**
         * @param splitList the entry splits to bundle; their order is kept
         */
        public MultiZipSplit(Collection<ZipEntrySplit> splitList) {
            ZipEntrySplit[] target = new ZipEntrySplit[splitList.size()];
            splits = splitList.toArray(target);
        }

        /** Returns the sum of the lengths of all bundled entry splits. */
        @Override
        public long getLength() throws IOException {
            long total = 0;
            for (int i = 0; i < splits.length; ++i) {
                total += splits[i].getLength();
            }
            return total;
        }

        /** Writes the number of bundled splits followed by each split in order. */
        @Override
        public void write(DataOutput out) throws IOException {
            final int count = splits.length;
            out.writeInt(count);
            for (int i = 0; i < count; ++i) {
                splits[i].write(out);
            }
        }

        /** Reads the number of bundled splits and then rebuilds each of them in order. */
        @Override
        public void readFields(DataInput in) throws IOException {
            final int count = in.readInt();
            splits = new ZipEntrySplit[count];
            int position = 0;
            while (position < count) {
                ZipEntrySplit entry = new ZipEntrySplit();
                splits[position] = entry;
                entry.readFields(in);
                ++position;
            }
        }

        /** Returns the sorted union of the locations reported by all bundled entry splits. */
        @Override
        public String[] getLocations() throws IOException {
            TreeSet<String> union = new TreeSet<String>();
            for (int i = 0; i < splits.length; ++i) {
                String[] entryHosts = splits[i].getLocations();
                for (int j = 0; j < entryHosts.length; ++j) {
                    union.add(entryHosts[j]);
                }
            }
            String[] result = new String[union.size()];
            return union.toArray(result);
        }
    }

    /**
     * Daemon thread that calls {@code progress()} on a task attempt context at a fixed interval until it is
     * interrupted.
     */
    private static class ProgressThread extends Thread {
        private final TaskAttemptContext taskContext;
        private final long pauseMillis;

        /**
         * @param context         the task attempt context to report progress on
         * @param reportIntervall the pause between two progress reports, in milliseconds
         */
        public ProgressThread(TaskAttemptContext context, long reportIntervall) {
            this.taskContext = context;
            this.pauseMillis = reportIntervall;
            setDaemon(true);
        }

        /** Reports progress, sleeps, and repeats; returns once the thread is interrupted while sleeping. */
        @Override
        public void run() {
            for (;;) {
                taskContext.progress();
                try {
                    Thread.sleep(pauseMillis);
                } catch (final InterruptedException e) {
                    LOG.debug("progress thread stopped");
                    return;
                }
            }
        }

        /** Creates a progress thread, starts it and returns it. */
        public static ProgressThread start(TaskAttemptContext reporter, long reportIntervall) {
            final ProgressThread worker = new ProgressThread(reporter, reportIntervall);
            worker.start();
            return worker;
        }
    }

    /**
     * Record reader that emits one record per zip entry of a {@link MultiZipSplit}: the key is the full entry
     * path and the value is the entry content.
     * <p>
     * A background {@link ProgressThread} reports progress once per second while the reader is open.
     */
    public class ZipEntryRecordReader extends RecordReader<Text, Text> {
        FileSystem fs;
        MultiZipSplit split;
        int index;
        ProgressThread thread;

        /**
         * Binds the reader to a split and starts progress reporting.
         * <p>
         * This method may run more than once on the same instance (createRecordReader() initializes the reader
         * and the MapReduce framework initializes it again), so a progress thread left over from an earlier
         * call is stopped first instead of being leaked.
         *
         * @param arg0 the split to read; must be a {@link MultiZipSplit}
         * @param arg1 the task attempt context providing the configuration and receiving progress reports
         */
        @Override
        public void initialize(InputSplit arg0, TaskAttemptContext arg1) throws IOException, InterruptedException {
            stopProgressThread();
            split = (MultiZipSplit) arg0;
            index = -1;
            fs = FileSystem.get(arg1.getConfiguration());
            thread = ProgressThread.start(arg1, 1000);
        }

        /** Stops the background progress thread, if one is running. */
        @Override
        public void close() throws IOException {
            stopProgressThread();
        }

        /** Interrupts the progress thread and forgets it so that it cannot be interrupted twice. */
        private void stopProgressThread() {
            if (thread != null) {
                thread.interrupt();
                thread = null;
            }
        }

        /** Returns the full entry path (archive path, "-", entry path) of the current entry. */
        @Override
        public Text getCurrentKey() throws IOException, InterruptedException {
            return new Text(split.splits[index].getFullEntryPath());
        }

        /**
         * Loads and returns the content of the current entry. The archive is read on every call.
         *
         * @throws IOException if the entry could not be found or read
         */
        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            ZipEntrySplit entry = split.splits[index];
            byte[] value = loadZipEntry(fs, entry);
            if (value == null) {
                // loadZipEntry signals a missing or unreadable entry with null; fail with a descriptive
                // message instead of the NullPointerException that new Text(null) would raise.
                throw new IOException("Unable to load zip entry (" + entry.getFullEntryPath() + ")");
            }
            return new Text(value);
        }

        /**
         * Returns the fraction of entries already handed out, always within [0, 1].
         *
         * @return 0 before the first record or when there is nothing to read, 1 once all entries are consumed
         */
        @Override
        public float getProgress() throws IOException, InterruptedException {
            if (split == null || split.splits == null || split.splits.length == 0) {
                return 0.0f;
            }
            // index is -1 before the first nextKeyValue() call; clamp so progress is never negative.
            int done = Math.max(0, Math.min(index, split.splits.length));
            return (1.0f * done) / split.splits.length;
        }

        /** Advances to the next entry; returns false once all entries of the split have been consumed. */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            ++index;
            return index < split.splits.length;
        }
    }

    /**
     * Creates a {@link ZipEntryRecordReader} for the given split and initializes it before handing it back.
     * NOTE(review): the MapReduce framework normally calls initialize() on the returned reader as well —
     * confirm that initializing it here too is intended.
     *
     * @param arg0 the split to read
     * @param arg1 the task attempt context
     * @return the initialized record reader
     */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit arg0, TaskAttemptContext arg1)
            throws IOException, InterruptedException {
        final ZipEntryRecordReader zipReader = new ZipEntryRecordReader();
        zipReader.initialize(arg0, arg1);
        return zipReader;
    }

    /**
     * Reports a file as splitable when it can be opened as a zip stream and contains at least one entry.
     *
     * @param context the job context providing the configuration used to reach the file system
     * @param file    the candidate zip archive
     * @return {@code true} if a first zip entry could be read; {@code false} if the file system is unavailable,
     *         the file cannot be read, or the archive has no entries
     */
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        FileSystem fs;
        try {
            fs = FileSystem.get(context.getConfiguration());
        } catch (IOException e) {
            // Pass the exception to the logger so the stack trace lands in the task log, not on stderr.
            LOG.error("unable to connect to context based FileSystem to check if file is splitable", e);
            return false;
        }

        LOG.debug("Verifying ZIP format for file: " + file.toString());
        ZipInputStream zis = null;

        try {
            zis = new ZipInputStream(fs.open(file));
            if (zis.getNextEntry() == null) {
                throw new IOException("No entries found! Empty zip file: " + file);
            }
            LOG.debug("...ZIP format verification successful!");
            return true;
        } catch (IOException ioe) {
            LOG.error("Exception encountered while trying to open and read ZIP input stream: " + ioe.toString(),
                    ioe);
            return false;
        } finally {
            try {
                if (zis != null) {
                    zis.close();
                }
            } catch (IOException ioe) {
                LOG.error("Exception while trying to close ZIP input stream: " + ioe);
            }
        }
    }

    /** Maps a file name to a mime type; used to filter zip entries against the configured mime types. */
    private FileNameMap fileNameMap = URLConnection.getFileNameMap();

    /**
     * Scans the given zip archives and creates one {@link ZipEntrySplit} per accepted file entry.
     * <p>
     * An entry is skipped when it is a directory, when a mime type filter is given and the type guessed from
     * the entry name is not in it, or when its reported size exceeds the size limit. Entries whose size is not
     * recorded in the archive (reported as -1) are never rejected by the size limit.
     *
     * @param context                 the job context handed to every created split
     * @param fs                      the file system used to open the archives
     * @param zipFiles                the archives to scan, in order
     * @param maxEntryFiles           maximum number of splits to collect, or {@code null} for no limit
     * @param ignoreFilesLargerThanMB size limit in megabytes, or {@code null} for no limit
     * @param processMimeTypes        lower-case mime types to accept; an empty list accepts everything
     * @return the entry splits in archive order
     * @throws IOException if an archive cannot be opened or read
     */
    private List<ZipEntrySplit> getZipFileEntries(JobContext context, FileSystem fs, Path[] zipFiles,
            Integer maxEntryFiles, Integer ignoreFilesLargerThanMB, List<String> processMimeTypes)
            throws IOException {
        ArrayList<ZipEntrySplit> splits = new ArrayList<ZipEntrySplit>();

        // The limit is configured in megabytes while zip entries report their size in bytes.
        final long maxEntryBytes = (ignoreFilesLargerThanMB == null) ? 0L
                : ignoreFilesLargerThanMB.longValue() * 1024L * 1024L;

        for (Path file : zipFiles) {
            LOG.debug("Opening zip file: " + file.toString());
            // Declared per archive so the finally block only ever closes the stream opened in this iteration.
            ZipInputStream zis = null;
            try {
                zis = new ZipInputStream(fs.open(file));
                ZipEntry zipEntry;
                while ((zipEntry = zis.getNextEntry()) != null) {
                    if (maxEntryFiles != null && splits.size() == maxEntryFiles.intValue()) {
                        LOG.debug("Exceeded maximum number of splits.  End getSplits()");
                        return splits;
                    }

                    if (zipEntry.isDirectory()) {
                        // Directory entries carry no data and would only produce empty records.
                        LOG.debug("Ignoring directory entry (" + zipEntry.getName() + ")");
                        zis.closeEntry();
                        continue;
                    }

                    boolean processFile = true;

                    if (!processMimeTypes.isEmpty()) {
                        // Ensure that if process mime types were specified, that entry
                        // mime type meets that criteria
                        String mimeType = fileNameMap.getContentTypeFor(zipEntry.getName());
                        if (mimeType == null || (!processMimeTypes.contains(mimeType.toLowerCase()))) {
                            processFile = false;
                            LOG.debug("Ignoring entry file (" + zipEntry.getName() + " mimeType(" + mimeType
                                    + ") not in process list");
                        }
                    }

                    long byteCount = zipEntry.getSize();
                    if (ignoreFilesLargerThanMB != null && byteCount > maxEntryBytes) {
                        processFile = false;
                        LOG.debug("Ignoring entry file (" + zipEntry.getName() + ") which exceeds size limit");
                    }

                    if (processFile) {
                        LOG.debug("Creating split for zip entry: " + zipEntry.getName() + " Size: " + byteCount
                                + " Method: " + (ZipEntry.DEFLATED == zipEntry.getMethod() ? "DEFLATED" : "STORED")
                                + " Compressed Size: " + zipEntry.getCompressedSize());

                        splits.add(new ZipEntrySplit(file, zipEntry.getName(), byteCount, context));
                    }
                    zis.closeEntry();
                }
            } finally {
                IOUtils.closeQuietly(zis);
            }

        }
        return splits;
    }

    /**
     * Builds the input splits for the job: every accepted zip entry of every input archive becomes a
     * {@link ZipEntrySplit}, and consecutive entry splits are grouped into {@link MultiZipSplit}s.
     * <p>
     * Configuration read: {@link #MAX_ENTRY_FILES}, {@link #NUM_FILES_PER_SPLIT} (default 1),
     * {@link #IGNORE_FILES_LARGER_THAN_IN_MB} and {@link #PROCESS_MIME_TYPES_ONLY} (comma separated).
     *
     * @param context the job context providing the configuration and the input paths
     * @return the grouped splits; empty when no entry was accepted
     * @throws IOException              if an input path does not exist, is a directory, or cannot be read
     * @throws IllegalArgumentException if the configured number of files per split is smaller than 1
     */
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        Integer maxEntryFiles = null;
        String value = context.getConfiguration().get(MAX_ENTRY_FILES);
        if (value != null) {
            maxEntryFiles = Integer.parseInt(value);
            LOG.info("Maximum number of zip entries:" + maxEntryFiles);
        }

        value = context.getConfiguration().get(NUM_FILES_PER_SPLIT, "1");
        int numFilesPerSplit = Integer.parseInt(value);
        if (numFilesPerSplit < 1) {
            // Zero would divide by zero below and a negative value would silently yield no splits at all.
            throw new IllegalArgumentException(
                    NUM_FILES_PER_SPLIT + " must be at least 1 but was " + numFilesPerSplit);
        }
        LOG.info("Number of files per split:" + numFilesPerSplit);

        Integer ignoreFilesLargerThanMB = null;
        value = context.getConfiguration().get(IGNORE_FILES_LARGER_THAN_IN_MB);
        if (value != null) {
            ignoreFilesLargerThanMB = Integer.parseInt(value);
            LOG.info("Ignore entry files larger than (" + ignoreFilesLargerThanMB + ")MB in size");
        }

        List<String> processMimeTypes = new ArrayList<String>();
        value = context.getConfiguration().get(PROCESS_MIME_TYPES_ONLY);
        if (value != null) {
            for (String val : value.split(",")) {
                // Trim so that a list written as "text/html, text/plain" matches both types.
                String mimeType = val.trim().toLowerCase();
                if (mimeType.length() != 0) {
                    processMimeTypes.add(mimeType);
                    LOG.info("Process entry files with mime type (" + mimeType + ")");
                }
            }
        }

        FileSystem fs = FileSystem.get(context.getConfiguration());

        LOG.debug("Start splitting input ZIP files");

        Path[] files = getInputPaths(context);
        for (int i = 0; i < files.length; i++) { // check we have valid files
            Path file = files[i];
            // Test for existence first: getFileStatus() itself fails on a missing path, which would
            // make the dedicated error message below unreachable.
            if (!fs.exists(file)) {
                throw new IOException("Input file provided (" + files[i] + ") does not exist.");
            }
            FileStatus status = fs.getFileStatus(file);
            if (status.isDir()) {
                throw new IOException("Input file provided (" + files[i] + ") is not a file but a directory.");
            }
        }

        //  Get all entry zip splits
        List<ZipEntrySplit> splits = getZipFileEntries(context, fs, files, maxEntryFiles, ignoreFilesLargerThanMB,
                processMimeTypes);
        LOG.info("There are (" + splits.size() + ") zip entry splits");

        // Determine final number of combined zip entry splits (rounded up)
        ArrayList<InputSplit> finalSplits = new ArrayList<InputSplit>();
        int totalSplits = splits.size() / numFilesPerSplit;
        if ((splits.size() % numFilesPerSplit) != 0) {
            ++totalSplits;
        }
        LOG.info("There will be (" + totalSplits + ") MultiZipSplit(s)");

        // Group zip entry splits into MultiZipSplits; the last group takes whatever is left
        int begin = 0;
        for (int i = 0; i < totalSplits; ++i) {
            int end = ((i + 1) == totalSplits) ? splits.size() : begin + numFilesPerSplit;

            LOG.info("\t MultiZipSplit begin(" + begin + ") end(" + end + ")");
            finalSplits.add(new MultiZipSplit(splits.subList(begin, end)));

            begin = end;
        }

        LOG.debug("End splitting input ZIP files.");
        return finalSplits;
    }

    /**
     * Reads up to {@code numBytes} bytes of the current zip entry.
     *
     * @param entry    the entry name, used for log messages only
     * @param zis      the zip stream positioned at the start of the entry data
     * @param numBytes the expected size of the entry in bytes
     * @return the bytes read (shorter than {@code numBytes} if the entry ends early), or {@code null} if
     *         reading failed
     */
    private static byte[] read(String entry, ZipInputStream zis, int numBytes) {
        byte[] data = new byte[numBytes];
        int n = 0;
        try {
            // Bulk reads; stop at the end of the entry so the EOF marker (-1) is never stored as data.
            while (n < numBytes) {
                int count = zis.read(data, n, numBytes - n);
                if (count < 0) {
                    break;
                }
                n += count;
            }
        } catch (IOException e) {
            LOG.error("failure reading zip entry(" + entry + ")", e);
            return null;
        }

        if (n < numBytes) {
            // The entry held less data than announced: return only what was actually read.
            LOG.warn("Expected bytes(" + numBytes + ") but read bytes(" + n + ") from entry (" + entry + ")");
            byte[] actual = new byte[n];
            System.arraycopy(data, 0, actual, 0, n);
            data = actual;
        }

        LOG.info("Read bytes(" + n + ") from entry (" + entry + ")");
        if (LOG.isDebugEnabled()) {
            // Decoding the whole payload is expensive; only do it when the message is really emitted.
            LOG.debug("Read value(" + Bytes.toString(data) + ") from entry (" + entry + ")");
        }

        return data;
    }

    /**
     * Reads the current zip entry to its end when the entry size is not known in advance.
     *
     * @param entry the entry name, used for log messages only
     * @param zis   the zip stream positioned at the start of the entry data
     * @return the entry content, or {@code null} if reading failed
     */
    private static byte[] read(String entry, ZipInputStream zis) {
        // Largest array size requested here; stays a little below Integer.MAX_VALUE on purpose.
        final int maxArraySize = Integer.MAX_VALUE - 8;

        byte[] buffer = new byte[bufSize];
        int n = 0;
        try {
            int count;
            // Bulk reads into a buffer that doubles whenever it is full; read() returns -1 at the end of
            // the entry.
            while ((count = zis.read(buffer, n, buffer.length - n)) >= 0) {
                n += count;
                if (n == buffer.length) {
                    if (buffer.length >= maxArraySize) {
                        throw new IOException("zip entry is too large to load into memory");
                    }
                    int grownLength = (int) Math.min(2L * buffer.length, (long) maxArraySize);
                    byte[] grown = new byte[grownLength];
                    System.arraycopy(buffer, 0, grown, 0, n);
                    buffer = grown;
                }
            }
        } catch (IOException e) {
            LOG.error("failure reading zip entry(" + entry + ")", e);
            return null;
        }

        // Trim the working buffer to the number of bytes actually read
        byte[] data = new byte[n];
        System.arraycopy(buffer, 0, data, 0, n);

        LOG.info("Read bytes(" + n + ") from entry (" + entry + ")");
        if (LOG.isDebugEnabled()) {
            // Decoding the whole payload is expensive; only do it when the message is really emitted.
            LOG.debug("Read value(" + Bytes.toString(data) + ") from entry (" + entry + ")");
        }

        return data;
    }

    /**
     * Loads the content of the zip entry described by a split.
     * <p>
     * The archive is opened and scanned from the start until an entry with the split's entry path is found.
     *
     * @param fs       the file system used to open the archive
     * @param zipSplit the split naming the archive, the entry and the entry length
     * @return the entry content, or {@code null} if the archive has no such entry or reading it failed
     * @throws IOException if the archive cannot be opened or scanned, or if the entry is too large to be held
     *                     in a single byte array
     */
    public static byte[] loadZipEntry(FileSystem fs, ZipEntrySplit zipSplit) throws IOException {
        byte[] data = null;
        Path file = zipSplit.getFile();
        String entryPath = zipSplit.getEntryPath();

        ZipInputStream zis = null;
        try {
            zis = new ZipInputStream(fs.open(file));
            ZipEntry zipEntry = zis.getNextEntry();
            while (zipEntry != null && !zipEntry.getName().equals(entryPath)) {
                zipEntry = zis.getNextEntry();
            }
            if (zipEntry == null) {
                LOG.warn("zip entry(" + entryPath + ") not found in file(" + file + ")");
            } else {
                long length = zipSplit.getLength();
                if (length > Integer.MAX_VALUE) {
                    // Casting such a length to int would wrap around and either fail with a negative array
                    // size or silently truncate the entry.
                    throw new IOException("zip entry(" + entryPath + ") in file(" + file
                            + ") is too large to load into memory: " + length + " bytes");
                }
                if (length > 0) {
                    data = read(entryPath, zis, (int) length);
                } else {
                    // Size not recorded in the split: read until the end of the entry.
                    data = read(entryPath, zis);
                }
            }
        } finally {
            IOUtils.closeQuietly(zis);
        }
        return data;
    }

}