gov.llnl.ontology.text.hbase.GzipTarInputFormat.java Source code

Introduction

Here is the source code for gov.llnl.ontology.text.hbase.GzipTarInputFormat.java.
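
Before the listing itself, here is a minimal sketch of how a driver might wire this input format into a Hadoop MapReduce job. The job name, the input path, and the DocumentMapper class are assumptions made for illustration (a matching pass-through mapper is sketched after the source listing); the input path is expected to name a plain text file listing one gzipped tarball per line, as the class's javadoc describes.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

import gov.llnl.ontology.text.hbase.GzipTarInputFormat;

public class GzipTarDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "read documents from gzipped tarballs");
        job.setJarByClass(GzipTarDriver.class);

        // args[0] names a plain text file listing one .tar.gz path per line.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(GzipTarInputFormat.class);

        // DocumentMapper is a hypothetical pass-through mapper; see the
        // sketch after the source listing below.
        job.setMapperClass(DocumentMapper.class);

        // Map-only job; this sketch simply discards the mapper output.
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(NullOutputFormat.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}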

Source

/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC. Produced at
 * the Lawrence Livermore National Laboratory. Written by Keith Stevens,
 * kstevens@cs.ucla.edu OCEC-10-073 All rights reserved. 
 *
 * This file is part of the C-Cat package and is covered under the terms and
 * conditions therein.
 *
 * The C-Cat package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package gov.llnl.ontology.text.hbase;

import gov.llnl.text.util.FileUtils;

import org.apache.commons.codec.digest.DigestUtils;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.tools.tar.TarEntry;
import org.apache.tools.tar.TarInputStream;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import java.util.ArrayList;
import java.util.List;

import java.util.zip.GZIPInputStream;

/**
 * A {@link FileInputFormat} for handling gzipped tarball files, where each
 * internal file contains the data for a single document.  This assumes that
 * the input file, or files, are in raw text format and contain one path to a
 * gzipped tarball per line.  Each entry in a gzipped tarball will be
 * considered a single document.
 *
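 * <p>For example, the file list handed to a job might be a plain text file
 * whose lines look like the following (the paths shown are purely
 * illustrative, not part of the original package):
 *
 * <pre>
 * /data/corpus/batch-000.tar.gz
 * /data/corpus/batch-001.tar.gz
 * </pre>
 *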
 * @author Keith Stevens
 */
public class GzipTarInputFormat extends FileInputFormat<ImmutableBytesWritable, Text> {

    /**
     * Returns a {@link GzipTarRecordReader}.  The record reader will return
     * each tarred file.
     */
    public RecordReader<ImmutableBytesWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new GzipTarRecordReader();
    }

    /**
     * Returns a {@link List} of {@link FileSplit}s.  Each {@link FileSplit}
     * will be a gzipped tarball of XML documents.  Each tarred file should
     * contain a single document.
     */
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        List<InputSplit> splits = new ArrayList<InputSplit>();

        // Get the list of zipped files to be processed and add each zipped file
        // as an InputSplit.
        FileSystem fs = FileSystem.get(context.getConfiguration());
        for (Path file : getInputPaths(context)) {
            // Check that the file listing the tarballs exists and is a
            // regular file.  Throw an exception if it is missing or is a
            // directory.
            if (fs.isDirectory(file) || !fs.exists(file))
                throw new IOException(
                        "Input file list does not exist or is a directory: " + file);

            // Read the contents of the file list and add each line as a
            // FileSplit.
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(file)));
            for (String line = null; (line = br.readLine()) != null;)
                splits.add(new FileSplit(new Path(line), 0, Integer.MAX_VALUE, null));
            br.close();
        }
        return splits;
    }

    /**
     * A {@link RecordReader} for processing gzipped tarballs of document files.
     * It is assumed that each tarballed file is a single document, or will be
     * processed further by other stages.
     */
    public class GzipTarRecordReader extends RecordReader<ImmutableBytesWritable, Text> {

        /**
         * The current {@link ImmutableBytesWritable} key read.
         */
        private ImmutableBytesWritable currentKey;

        /**
         * The current {@link Text} document.
         */
        private Text currentDocument;

        /**
         * The {@link TarInputStream} used to read files.
         */
        private TarInputStream tarStream;

        /**
         * Contains the parent path of the gzipped tarball being processed by
         * this {@link GzipTarRecordReader}.
         */
        private String parentName;

        /**
         * Extract the {@link Path} for the file to be processed by this {@link
         * GzipTarRecordReader}.
         */
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            // Get the file Path for this input split.
            Configuration config = context.getConfiguration();
            FileSystem fs = FileSystem.get(config);
            FileSplit fileSplit = (FileSplit) split;
            Path filePath = fileSplit.getPath();
            parentName = filePath.getParent().getName();
            InputStream is = fs.open(filePath);

            System.err.println(filePath.toString());
            // Unzip the file and get a tarball reader.
            GZIPInputStream gis = new GZIPInputStream(is);
            tarStream = new TarInputStream(gis);
        }

        /**
         * Advances the reader one step to point to the next file in the
         * tarball.  It returns {@code false} when there are no more files in
         * the tarball.
         */
        public boolean nextKeyValue() throws IOException {
            TarEntry tarEntry = null;

            // Iterate through the tar entries until a regular file or the end
            // of the tarball is found.
            while ((tarEntry = tarStream.getNextEntry()) != null && tarEntry.isDirectory())
                ;

            // Return false when there are no more entries in the tarball.
            if (tarEntry == null)
                return false;

            // Set the current key to a SHA-1 hex digest of the parent
            // directory name concatenated with the tar entry's name.
            String key = parentName + tarEntry.getName();
            currentKey = new ImmutableBytesWritable(DigestUtils.shaHex(key).getBytes());

            // Set the current document.
            String document = FileUtils.readFile(new BufferedReader(new InputStreamReader(tarStream)));
            currentDocument = new Text(document);

            // Successfully advanced to the next document.
            return true;
        }

        /**
         * {@inheritDoc}
         */
        public ImmutableBytesWritable getCurrentKey() {
            return currentKey;
        }

        /**
         * {@inheritDoc}
         */
        public Text getCurrentValue() {
            return currentDocument;
        }

        /**
         * {@inheritDoc}
         */
        public float getProgress() throws IOException, InterruptedException {
            return 1.0f;
        }

        /**
         * {@inheritDoc}
         */
        public void close() throws IOException {
            // Release the underlying tar (and wrapped gzip) stream.
            if (tarStream != null)
                tarStream.close();
        }
    }
}
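
As a usage sketch, the following hypothetical mapper shows the shape of the records that GzipTarRecordReader delivers: the key is the SHA-1 hex digest built in nextKeyValue() and the value is the full text of one tarred file. The class name and the pass-through body are illustrative assumptions, not part of the C-Cat package.

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * A hypothetical mapper illustrating the records produced by {@code
 * GzipTarInputFormat}: one (digest, document text) pair per tarred file.
 */
public class DocumentMapper
        extends Mapper<ImmutableBytesWritable, Text, ImmutableBytesWritable, Text> {

    protected void map(ImmutableBytesWritable key, Text document, Context context)
            throws IOException, InterruptedException {
        // A real job might parse the document or store it in an HBase table;
        // this sketch simply re-emits the record unchanged.
        context.write(key, document);
    }
}

Note that each tarred file is read fully into memory before being handed to the mapper as a single Text value, so individual documents need to fit comfortably within a map task's heap.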