edu.cmu.cs.in.hadoop.HoopWholeFileRecordReader.java Source code

Java tutorial

Introduction

Here is the source code for edu.cmu.cs.in.hadoop.HoopWholeFileRecordReader.java

Source

/** 
 * Author: Martin van Velsen <vvelsen@cs.cmu.edu>
 * 
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as 
 *  published by the Free Software Foundation, either version 3 of the 
 *  License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package edu.cmu.cs.in.hadoop;

import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;

import edu.cmu.cs.in.base.HoopRoot;
import edu.cmu.cs.in.base.HoopStringTools;

/*
//import org.apache.commons.logging.Log;
//import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
//import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
    
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
//import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
*/

import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
//import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;

/**
 * Record reader that delivers the complete content of one input split's file
 * as a single record.
 *
 * The record key is a number taken from the file name (see
 * {@link #createKeyFromName(String)}), the record value is the (decompressed,
 * when a codec matches the file name) content of the file. Exactly one record
 * is produced per reader; every later call to
 * {@link #next(LongWritable, Text)} returns {@code false}.
 *
 * The constructor does not throw checked exceptions. When the file cannot be
 * opened the failure is logged through {@code debug} and the reader simply
 * produces no record.
 *
 * @author vvelsen
 */
public class HoopWholeFileRecordReader extends HoopRoot implements RecordReader<LongWritable, Text> {
    /** Smallest read buffer allocated, also used when the split length is unknown or zero. */
    private static final int MIN_BUFFER_SIZE = 4096;
    /** Largest array size requested; stays slightly below Integer.MAX_VALUE, a limit some VMs impose. */
    private static final int MAX_BUFFER_SIZE = Integer.MAX_VALUE - 8;

    // Only touched by setValue()/getValue(); next() fills the Text supplied by the caller.
    private Text value = null;
    // Length of the split as reported by the FileSplit (compressed length for compressed files).
    private long fileSize = -1;
    // Stream the record is read from; null when opening failed or after close().
    private InputStream inStream = null;
    private JobConf job = null;
    // Textual form of the key, as extracted from the file name.
    private String internalKey = "";
    // Numeric form of the key, written into the LongWritable handed to next().
    private long numericKey = 0;
    // Number of content bytes delivered so far, reported by getPos().
    private long bytesRead = 0;
    // True once the single record has been handed out.
    private boolean processed = false;

    /**
     * Prepares a reader for the file behind the given split.
     *
     * @param aJob   job configuration, used to resolve the file system and the
     *               compression codecs
     * @param aSplit split to read; must be a {@code FileSplit}
     */
    public HoopWholeFileRecordReader(JobConf aJob, InputSplit aSplit) {
        setClassName("HoopWholeFileRecordReader");
        debug("HoopWholeFileRecordReader ()");

        job = aJob;

        FileSplit split = (FileSplit) aSplit;

        // Must be known before the key is created: it is the fallback key.
        fileSize = split.getLength();

        final Path file = split.getPath();

        createKeyFromName(file.getName());

        debug("File/Key: " + internalKey + " with size: " + fileSize);

        inStream = openStream(file);
    }

    /**
     * Opens the file and, when a codec is registered for its name, wraps the
     * raw stream in the matching decompressing stream.
     *
     * @param file path of the file to open
     * @return the stream to read the content from, or {@code null} when the
     *         file could not be opened (the cause is logged)
     */
    private InputStream openStream(Path file) {
        final CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);

        FSDataInputStream fileIn = null;

        try {
            FileSystem fs = file.getFileSystem(job);

            fileIn = fs.open(file);

            if (codec == null) {
                return (fileIn);
            }

            return (codec.createInputStream(fileIn));
        } catch (IOException e) {
            debug("Unable to open " + file + " for reading: " + e);

            // The raw stream may already be open when wrapping it failed.
            if (fileIn != null) {
                try {
                    fileIn.close();
                } catch (IOException closeError) {
                    debug("Unable to close " + file + ": " + closeError);
                }
            }

            return (null);
        }
    }

    /**
     * Primarily this key generated is designed to turn document file names like
     * the ones of the form wiki-doc10.txt into a long integer 10. The code is
     * somewhat elaborate but this way it can extract any number from an encoded
     * filename and not just from the wiki dataset.
     *
     * The first entry returned by {@code HoopStringTools.StringToNumbers} is
     * used. When there is no such entry, or it does not parse as a long, the
     * split length is used as the key instead.
     *
     * @param aFileName name of the file, without its directory
     * @return the textual key, which is also stored in {@code internalKey}
     */
    private String createKeyFromName(String aFileName) {
        String cleaned = aFileName.toLowerCase();

        LinkedList<String> numbers = HoopStringTools.StringToNumbers(cleaned);

        if (numbers == null || numbers.isEmpty()) {
            numericKey = fileSize;
            internalKey = Long.toString(fileSize);

            return (internalKey);
        }

        internalKey = numbers.get(0);

        try {
            numericKey = Long.parseLong(internalKey);
        } catch (NumberFormatException e) {
            // Not representable as a long (or null): fall back to the split length.
            numericKey = fileSize;
        }

        return (internalKey);
    }

    /**
     * The current progress of the record reader through its data.
     *
     * @return 0.0 before the single record has been delivered, 1.0 afterwards
     */
    @Override
    public float getProgress() {
        return (processed ? 1.0f : 0.0f);
    }

    /**
     * Closes the underlying stream, if one is open. Calling this more than
     * once is harmless.
     *
     * @throws IOException when closing the stream fails
     */
    @Override
    public synchronized void close() throws IOException {
        debug("close ()");

        if (inStream == null) {
            return;
        }

        try {
            inStream.close();
        } finally {
            inStream = null;
        }
    }

    /**
     * Delivers the whole file as one record on the first call.
     *
     * @param key   receives the numeric key derived from the file name
     * @param value receives the complete content of the file
     * @return {@code true} when a record was stored in {@code key} and
     *         {@code value}; {@code false} when the record was already
     *         delivered or no input stream is available
     * @throws IOException when reading fails or the content does not fit into
     *                     a single array
     */
    @Override
    public boolean next(LongWritable key, Text value) throws IOException {
        debug("next ()");

        if (processed) {
            return (false);
        }

        if (inStream == null) {
            debug("Internal error: no input stream available!");
            return (false);
        }

        // Set first so that a failed read is never followed by a partial record.
        processed = true;

        key.set(numericKey);

        readInto(value);

        debug("Read the file");

        return true;
    }

    /**
     * Reads the stream up to its end and stores the bytes in the target.
     *
     * A single read call may return fewer bytes than requested, and for
     * compressed files the split length says nothing about the decompressed
     * size, so reading continues until end of stream and the buffer grows on
     * demand.
     *
     * @param target receives the bytes that were read
     * @throws IOException when reading fails or the content is too large
     */
    private void readInto(Text target) throws IOException {
        long wanted = Math.max(fileSize, MIN_BUFFER_SIZE);
        byte[] buffer = new byte[(int) Math.min(wanted, MAX_BUFFER_SIZE)];
        int total = 0;

        while (true) {
            if (total == buffer.length) {
                // Buffer is full: check for end of stream with a single byte
                // before paying for a larger buffer.
                int probe = inStream.read();

                if (probe == -1) {
                    break;
                }

                buffer = grow(buffer);
                buffer[total++] = (byte) probe;
                continue;
            }

            int count = inStream.read(buffer, total, buffer.length - total);

            if (count == -1) {
                break;
            }

            total += count;
        }

        target.set(buffer, 0, total);

        bytesRead = total;
    }

    /**
     * Returns a copy of the buffer with twice the capacity, capped at
     * {@code MAX_BUFFER_SIZE}.
     *
     * @param buffer full buffer to enlarge
     * @return a larger array starting with the content of {@code buffer}
     * @throws IOException when the buffer already has the maximum size
     */
    private static byte[] grow(byte[] buffer) throws IOException {
        if (buffer.length >= MAX_BUFFER_SIZE) {
            throw new IOException("File is too large to be read as a single record");
        }

        long doubled = Math.min(2L * buffer.length, MAX_BUFFER_SIZE);
        byte[] larger = new byte[(int) doubled];

        System.arraycopy(buffer, 0, larger, 0, buffer.length);

        return (larger);
    }

    /**
     * Creates an empty key object for use with {@link #next(LongWritable, Text)}.
     */
    @Override
    public LongWritable createKey() {
        debug("createKey ()");

        return (new LongWritable());
    }

    /**
     * Creates an empty value object for use with {@link #next(LongWritable, Text)}.
     */
    @Override
    public Text createValue() {
        debug("createValue ()");

        return (new Text());
    }

    /**
     * Returns the number of content bytes delivered so far: 0 before the
     * record has been read, the size of the record afterwards.
     */
    @Override
    public long getPos() throws IOException {
        return (bytesRead);
    }

    /**
     * Stores a value object in this reader. The stored object is not used by
     * {@link #next(LongWritable, Text)}.
     */
    public void setValue(Text value) {
        this.value = value;
    }

    /**
     * Returns the object last passed to {@link #setValue(Text)}, or
     * {@code null} when none was set.
     */
    public Text getValue() {
        return value;
    }
}