authordetect.input.SingleBookReader.java Source code

Java tutorial

Introduction

Here is the source code for authordetect.input.SingleBookReader.java

Source

package authordetect.input;

import authordetect.structure.TextArrayWritable;
import authordetect.structure.WordCountMap;
import authordetect.util.BookCounter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

import java.io.IOException;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Created by Qiu on 4/24/15.
 * Record reader that turns a single book file into at most one record.
 * <p>
 * The reader first scans the file header for the "Author" (and, when grouping by book,
 * the "Title") line, then for a line containing both "start" and "gutenberg". Every line
 * after that start line is tokenized and counted until a line containing both "end" and
 * "gutenberg" (or the end of the file) is reached.
 * <p>
 * Output key: Text ---> "Author / Maximum Word Count" (GROUP_OPTION 0) or
 * "Title_Author / Maximum Word Count" (any other GROUP_OPTION), joined by "/" without spaces.
 * Output value: TextArray ---> [ "Word A/Word A Count", ... ]
 */

public class SingleBookReader extends RecordReader<Text, TextArrayWritable> {

    /** Matches every character that is not an ASCII letter or digit; compiled once and reused. */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-zA-Z0-9]");

    private LineReader lineReader;
    private String title; // author name (option 0) or "title_author" (other options)
    private Text currentLine = new Text(""); // buffer holding the most recently read line
    private Text key; //key is book info
    private TextArrayWritable value; //value is words count array
    private long start, end, currentPos;
    private String filename;
    private boolean hasTitleOrAuthor = true;
    private boolean hasStart = true;
    private final WordCountMap wordCountMap;
    private final TaskAttemptContext context;
    private boolean isFinish = false;

    /**
     * @param context task context used later to update the {@code BookCounter.BOOK_COUNT} counter
     */
    public SingleBookReader(TaskAttemptContext context) {
        this.wordCountMap = new WordCountMap();
        this.context = context;
    }

    /**
     * Opens the file behind the split, positions the reader at the split start and scans
     * the header up to the book start line.
     *
     * @param inputSplit the split to read; must be a {@link FileSplit}
     * @param context    the information about the task; its configuration supplies
     *                   "GROUP_OPTION" (0 for group by author, anything else for group by book)
     * @throws java.io.IOException  if the file cannot be opened or the first lines cannot be read
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {

        FileSplit split = (FileSplit) inputSplit;
        Configuration configuration = context.getConfiguration();

        // 0 for group by author, 1 for group by book
        int option = configuration.getInt("GROUP_OPTION", 0);

        Path path = split.getPath();
        filename = path.getName();
        FileSystem fileSystem = path.getFileSystem(configuration);
        FSDataInputStream inputStream = fileSystem.open(path);
        // Assigned immediately so that close() releases the stream even if a later step fails.
        lineReader = new LineReader(inputStream, configuration);

        //initial start point and end point
        start = split.getStart();
        end = start + split.getLength();

        inputStream.seek(start);
        if (start != 0) {
            // Not at the beginning of the file: discard the (possibly partial) first line.
            start += lineReader.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
        }

        start += lineReader.readLine(currentLine);

        prepareToScanBook(option);
    }

    /**
     * Preparation to process actual book content.
     * Skips everything up to and including the book start line (license content,
     * project description etc.) and records the resulting position in {@code start}
     * and {@code currentPos}. When the header or the start line cannot be found,
     * {@code hasTitleOrAuthor} or {@code hasStart} is cleared and no record will be produced.
     *
     * @param opt grouping option: 0 for author, anything else for book title
     */
    private void prepareToScanBook(int opt) {
        //get the title of the book
        while (!containsTitleOrAuthor(currentLine, opt)) {
            if (!advanceHeaderLine("title")) {
                hasTitleOrAuthor = false;
                return;
            }
        }

        //get book start line
        while (!isBookStart(currentLine)) {
            if (!advanceHeaderLine("start line")) {
                hasStart = false;
                return;
            }
        }

        currentPos = start;
    }

    /**
     * Reads the next header line into {@code currentLine} and advances {@code start}.
     *
     * @param what description of what is being searched for, used in the error message
     * @return {@code false} when the end of the file was reached or reading failed
     */
    private boolean advanceHeaderLine(String what) {
        try {
            int readBytes = lineReader.readLine(currentLine);
            if (readBytes == 0) {
                return false; // end of file: nothing more to scan
            }
            //update cursor of linereader
            start += readBytes;
            return true;
        } catch (IOException e) {
            // Best effort: a book whose header cannot be read is skipped, not fatal.
            System.err.println("Error when retriving " + what + " for book ---> " + filename);
            System.err.println(e.getMessage());
            return false;
        }
    }

    /**
     * Inspects one header line and updates {@code title} from it.
     * <p>
     * With option 0 the author name becomes the title. With any other option a "Title"
     * line is remembered and the following "Author" line completes the title as
     * "title_author"; if no usable "Title" line was seen, the file name stands in for it.
     * Lines that start with "Title"/"Author" but carry no value after a colon are ignored.
     *
     * @param line   the header line to inspect
     * @param option grouping option: 0 for author, anything else for book title
     * @return {@code true} once the author line has been consumed and {@code title} is complete
     */
    private boolean containsTitleOrAuthor(Text line, int option) {
        String lineString = line.toString();

        if (option != 0 && lineString.startsWith("Title")) {
            String bookTitle = extractHeaderValue(lineString);
            if (bookTitle != null) {
                title = bookTitle;
            }
        }

        if (!lineString.startsWith("Author")) {
            return false;
        }
        String author = extractHeaderValue(lineString);
        if (author == null) {
            return false; // e.g. a line that merely begins with the word "Author"
        }

        if (option == 0) {
            title = author;
        } else {
            String bookTitle = (title != null) ? title : filename;
            title = bookTitle + "_" + author;
        }
        return true;
    }

    /**
     * Returns the trimmed text after the first ':' of a header line.
     *
     * @param headerLine a line such as "Author: Jane Austen"
     * @return the value, or {@code null} when there is no colon or nothing after it
     */
    private static String extractHeaderValue(String headerLine) {
        int colon = headerLine.indexOf(':');
        if (colon < 0) {
            return null;
        }
        // Everything after the first colon is kept so values containing ':' are not truncated.
        String headerValue = headerLine.substring(colon + 1).trim();
        return headerValue.isEmpty() ? null : headerValue;
    }

    /**
     * @param line the line to inspect
     * @return {@code true} when the line contains both "start" and "gutenberg", ignoring case
     */
    private boolean isBookStart(Text line) {
        // Locale.ROOT keeps the case folding independent of the JVM default locale.
        String lowered = line.toString().toLowerCase(Locale.ROOT);
        return lowered.contains("start") && lowered.contains("gutenberg");
    }

    /**
     * Produces the single record of this book on the first successful call.
     *
     * @return {@code true} when a new key/value pair is available
     * @throws IOException          if reading the book content fails
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {

        if (!filename.endsWith("txt")) {//only process txt file
            return false;
        }

        if (currentPos >= end || !hasTitleOrAuthor || !hasStart) {//false if finishes processing the split
            return false;
        }

        if (isFinish) {
            return false;
        }
        processBookContent();
        return true;
    }

    /** @return the key built by the last successful {@link #nextKeyValue()}, or {@code null} before it */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /** @return the value built by the last successful {@link #nextKeyValue()}, or {@code null} before it */
    @Override
    public TextArrayWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * Reads the book content line by line, counts its words and builds the key/value pair.
     * Reading stops at the first line containing both "end" and "gutenberg" (that marker
     * line itself is not counted) or at the end of the file, whichever comes first.
     * The book counter is incremented once per processed book.
     *
     * @throws IOException if reading a line fails
     */
    private void processBookContent() throws IOException {

        while (!isFinish) {
            int readBytes = lineReader.readLine(currentLine);
            if (readBytes == 0) {
                // End of file without an end marker: stop instead of spinning on empty reads.
                isFinish = true;
                break;
            }
            currentPos += readBytes;

            String loweredLine = currentLine.toString().toLowerCase(Locale.ROOT);
            //detect book end
            if (loweredLine.contains("end") && loweredLine.contains("gutenberg")) {
                isFinish = true;
            } else {
                countWords(loweredLine);
            }
        }

        //update counter which stores the book count
        Counter counter = context.getCounter(BookCounter.BOOK_COUNT);
        counter.increment(1);

        buildRecord();
    }

    /**
     * Splits a lower-cased line on single spaces, strips every non-alphanumeric character
     * from each token and adds the non-empty results to the word map.
     *
     * @param loweredLine one line of book content, already lower-cased
     */
    private void countWords(String loweredLine) {
        for (String token : loweredLine.split(" ")) {
            String word = NON_ALPHANUMERIC.matcher(token).replaceAll("");
            if (!word.isEmpty()) {
                wordCountMap.put(word, 1);
            }
        }
    }

    /**
     * Converts the word map into the output value ("word/count" entries) and builds the
     * output key from the title and the largest count seen.
     */
    private void buildRecord() {
        int arrayLen = wordCountMap.entrySet().size();
        Iterator<Map.Entry<String, Integer>> iterator = wordCountMap.entrySet().iterator();
        Text[] wordArray = new Text[arrayLen];
        int maxCount = 0;

        for (int i = 0; i < arrayLen; i++) {
            Map.Entry<String, Integer> entry = iterator.next();
            int count = entry.getValue();
            wordArray[i] = new Text(entry.getKey() + "/" + count);

            if (count > maxCount) {//get the maximum word count as well
                maxCount = count;
            }
        }

        key = new Text(title + "/" + maxCount);
        value = new TextArrayWritable(wordArray);
    }

    /**
     * @return the fraction of the bytes between the book start line and the split end that
     *         have been read so far, capped at 1.0; 0.0 when that range is empty
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (start == end) {
            return 0.0f;
        }
        return Math.min(1.0f, (currentPos - start) / (float) (end - start));
    }

    /**
     * Closes the line reader if it was opened.
     *
     * @throws IOException if closing the reader fails
     */
    @Override
    public void close() throws IOException {
        // initialize() may have failed before the reader was created.
        if (lineReader != null) {
            lineReader.close();
        }
    }

}