com.mcd.gdw.daas.mapreduce.ZipFileRecordReader.java Source code

Java tutorial

Introduction

Here is the source code for com.mcd.gdw.daas.mapreduce.ZipFileRecordReader.java

Source

package com.mcd.gdw.daas.mapreduce;

/**
* Copyright 2011 Michael Cutler <m@cotdp.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.util.Arrays;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;

import javax.naming.Context;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.mortbay.jetty.EofException;

/**
 * This RecordReader implementation extracts individual files from a ZIP
 * file and hands them over to the Mapper. The "key" is the decompressed
 * file name, the "value" is the file contents.
 *
 * <p>Instead of failing the task, problems are reported to the Mapper as
 * marker values:
 * <ul>
 *   <li>{@code CORRUPT_FILE:File <zip name> is corrupt} - the archive is empty,
 *       unreadable or an entry could not be decompressed. At most one such
 *       marker is emitted per archive and it is always the last record.</li>
 *   <li>{@code FILE_SIZE_EXCEEDS_ALLOWED_LIMIT_<limit>} - the entry is larger
 *       than {@code MAX_FILE_SIZE} and size-based skipping is enabled.</li>
 * </ul>
 *
 * <p>Configuration keys read in {@link #initialize}: {@code skipFilesonSize}
 * (size-based skipping is active only when the value is "true", ignoring case)
 * and {@code MAX_FILE_SIZE} (limit in bytes, parsed as a long; stays 0 when the
 * key is absent).
 */
public class ZipFileRecordReader extends RecordReader<Text, BytesWritable> {
    /** Size of the scratch buffer used while draining a ZIP entry */
    private static final int BUFFER_SIZE = 8192;

    /** InputStream used to read the ZIP file from the FileSystem */
    private FSDataInputStream fsin;

    /** ZIP file parser/decompresser */
    private ZipInputStream zip;

    /**
     * Uncompressed file name. NOTE: for a corruption marker raised while
     * advancing to the next entry this still holds the previous entry's name,
     * or null when no entry has been read yet.
     */
    private Text currentKey;

    /** Uncompressed file contents, or one of the marker values */
    private BytesWritable currentValue;

    /** Set once the archive is exhausted or has been reported as corrupt */
    private boolean isFinished = false;

    /** Largest entry size (bytes) accepted when size-based skipping is enabled */
    long MAX_FILE_SIZE = 0;

    /** Raw value of the "skipFilesonSize" setting; null when not configured */
    String skipFilesonSize = "true";

    /** Name (last path component) of the ZIP file being read */
    private String zipfilename;
    TaskAttemptContext taskAttemptContext;

    /** Location of the ZIP file being read */
    Path path = null;

    /** Accumulates the decompressed bytes of the current entry */
    ByteArrayOutputStream bos = new ByteArrayOutputStream();

    /** Backing array of the most recently produced entry value */
    byte[] bytesfrombos = null;

    /** Scratch buffer for reads from the ZIP stream */
    byte[] temp = null;

    /** True until the first attempt to read an entry has completed */
    boolean isFirstTime = true;

    /**
     * Initialise and open the ZIP file from the FileSystem
     *
     * @param inputSplit the split to read; must be a {@link FileSplit}
     * @param taskAttemptContext supplies the job configuration
     * @throws IOException if the file cannot be opened
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
            throws IOException, InterruptedException {

        this.taskAttemptContext = taskAttemptContext;
        FileSplit split = (FileSplit) inputSplit;
        Configuration conf = taskAttemptContext.getConfiguration();
        path = split.getPath();
        FileSystem fs = path.getFileSystem(conf);
        zipfilename = path.getName();

        // Open the stream
        fsin = fs.open(path);
        zip = new ZipInputStream(fsin);

        // An absent key yields null here, which disables size-based skipping.
        skipFilesonSize = conf.get("skipFilesonSize");

        String configuredLimit = conf.get("MAX_FILE_SIZE");
        if (configuredLimit != null) {
            MAX_FILE_SIZE = Long.parseLong(configuredLimit);
        }
    }

    /**
     * This is where the magic happens, each ZipEntry is decompressed and
     * readied for the Mapper. The contents of each file is held *in memory*
     * in a BytesWritable object.
     *
     * If the ZipFileInputFormat has been set to Lenient (not the default),
     * a ZipException raised while advancing to the next entry is ignored and
     * treated like the end of the archive.
     *
     * @return true when a key/value pair (entry contents or a marker value)
     *         is available, false when the archive is finished
     * @throws IOException if closing a fully read entry fails
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {

        // Once the archive is exhausted or reported as corrupt the stream must
        // not be touched again: retrying on a broken stream would keep
        // producing corruption markers.
        if (isFinished) {
            return false;
        }

        // Start every entry with a fresh buffer so a large previous entry
        // does not stay reachable through this reader.
        bos = new ByteArrayOutputStream();
        bytesfrombos = null;
        if (temp == null) {
            temp = new byte[BUFFER_SIZE];
        }

        ZipEntry entry = null;
        try {
            entry = zip.getNextEntry();
        } catch (ZipException e) {
            if (!ZipFileInputFormat.getLenient()) {
                e.printStackTrace();
                return emitCorruptMarker();
            }
            // Lenient: fall through with entry == null.
        } catch (Exception ex) {
            // Best effort: any other failure is handled like "no more entries".
            ex.printStackTrace();
        }

        if (entry == null) {
            // No entry at all on the very first read means the file is empty
            // or not a ZIP archive; report that instead of ending silently.
            if (isFirstTime) {
                return emitCorruptMarker();
            }
            isFinished = true;
            return false;
        }
        isFirstTime = false;

        // Filename
        currentKey = new Text(entry.getName());

        boolean enforceLimit = "true".equalsIgnoreCase(skipFilesonSize);
        if (enforceLimit && entry.getSize() > MAX_FILE_SIZE) {
            return emitOversizeMarker(entry);
        }

        // Read the file contents
        while (true) {
            int bytesRead;
            try {
                bytesRead = zip.read(temp, 0, temp.length);
            } catch (Exception e) {
                // Covers truncated archives (java.io.EOFException) as well as
                // invalid compressed data.
                e.printStackTrace();
                return emitCorruptMarker();
            }
            if (bytesRead <= 0) {
                break;
            }
            // getSize() is -1 when the size is only recorded after the entry
            // data, so the limit is also checked against the bytes actually
            // decompressed. The rest of the entry is skipped by the next
            // getNextEntry() call.
            if (enforceLimit && (long) bos.size() + bytesRead > MAX_FILE_SIZE) {
                return emitOversizeMarker(entry);
            }
            bos.write(temp, 0, bytesRead);
        }
        zip.closeEntry();

        // Uncompressed contents
        bytesfrombos = bos.toByteArray();
        currentValue = new BytesWritable(bytesfrombos);

        return true;
    }

    /**
     * Publishes the corruption marker as the current value and ends the archive.
     * The current key is left untouched.
     *
     * @return always true, so callers can return the result directly
     */
    private boolean emitCorruptMarker() throws IOException {
        currentValue = new BytesWritable(
                ("CORRUPT_FILE:File " + path.getName() + " is corrupt").getBytes("UTF-8"));
        isFinished = true;
        isFirstTime = false;
        return true;
    }

    /**
     * Publishes the size-limit marker as the current value for the given entry.
     *
     * @return always true, so callers can return the result directly
     */
    private boolean emitOversizeMarker(ZipEntry entry) throws IOException {
        currentValue = new BytesWritable(
                ("FILE_SIZE_EXCEEDS_ALLOWED_LIMIT_" + MAX_FILE_SIZE).getBytes("UTF-8"));
        System.out.println(" Skipping file because it exceeds max file size " + entry.getName());
        return true;
    }

    /**
     * Rather than calculating progress, we just keep it simple: 0 while the
     * archive is being read, 1 once it is exhausted or reported as corrupt.
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return isFinished ? 1 : 0;
    }

    /**
     * Returns the current key (name of the zipped file)
     */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return currentKey;
    }

    /**
     * Returns the current value (contents of the zipped file, or a marker)
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return currentValue;
    }

    /**
     * Releases the buffers and closes the streams quietly, ignoring any
     * exceptions raised while closing.
     */
    @Override
    public void close() throws IOException {
        if (currentValue != null) {
            currentValue.setSize(0);
            currentValue = null;
        }

        bos = null;
        bytesfrombos = null;
        temp = null;

        if (zip != null) {
            try {
                zip.close();
            } catch (Exception ignored) {
                // best-effort close
            }
        }
        if (fsin != null) {
            try {
                fsin.close();
            } catch (Exception ignored) {
                // best-effort close
            }
        }
    }
}