name.abhijitsarkar.hadoop.citation.CitationCombinerWithChaining.java Source code

Java tutorial

Introduction

Here is the source code for name.abhijitsarkar.hadoop.citation.CitationCombinerWithChaining.java

Source

/*******************************************************************************
 * Copyright (c) 2014, the original author or authors.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * A copy of the GNU General Public License accompanies this software, 
 * and is also available at http://www.gnu.org/licenses.
 *******************************************************************************/
package name.abhijitsarkar.hadoop.citation;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.ChainMapper;
import org.apache.hadoop.mapred.lib.ChainReducer;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class works on the cite.txt. For the purposes of learning, it chains two mappers. It outputs a tab-separated
 * key-value pair (tab being the default separator of TextOutputFormat) where the key is a citation ID and the value
 * is a comma-separated list of citations that refer to the key.
 * 
 * @author Abhijit Sarkar
 */
public class CitationCombinerWithChaining extends Configured implements Tool {
    /** Delimiter used to split each input line and to join the citations emitted by the reducer. */
    public static final String COMMA = ",";
    /** Logger used by the nested mapper and reducer classes. */
    public static final Logger LOGGER = LoggerFactory.getLogger(CitationCombinerWithChaining.class);

    public static class CitationInputSplitMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            final String[] lineSplit = value.toString().split(COMMA);

            String keyStr = null;
            String valueStr = null;

            if (lineSplit.length == 2) {
                keyStr = lineSplit[0];
                valueStr = lineSplit[1];

                output.collect(new Text(keyStr), new Text(valueStr));
            }
        }
    }

    public static class CitationHeaderStripMapper extends MapReduceBase implements Mapper<Text, Text, Text, Text> {

        public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {

            /* Skip the header row */
            try {
                Long.valueOf(key.toString());

                output.collect(value, key);
            } catch (NumberFormatException nfe) {
                LOGGER.warn("NumberFormatException: {}.", nfe.getMessage());
            }
        }
    }

    public static class CitationReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
        static final byte[] COMMA_BYTES = new Text(COMMA).getBytes();

        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            Text value = null;
            Text citations = new Text();

            while (values.hasNext()) {
                value = values.next();

                if (value != null && value.getLength() > 0) {
                    citations.append(value.getBytes(), 0, value.getLength());

                    if (values.hasNext()) {
                        citations.append(COMMA_BYTES, 0, 1);
                    }
                }
            }

            LOGGER.debug("Key: {}, Citations: {}.", key, citations.toString());

            output.collect(key, citations);
        }
    }

    /**
     * Configures and runs the chained job: {@link CitationInputSplitMapper}, then
     * {@link CitationHeaderStripMapper}, then {@link CitationReducer}.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} is the output path; additional arguments
     *             are ignored
     * @return {@code 0} once {@code JobClient.runJob} returns, or {@code -1} if fewer than two arguments are given
     * @throws Exception if setting up or running the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        /* Fail with a usage message instead of an ArrayIndexOutOfBoundsException when paths are missing. */
        if (args == null || args.length < 2) {
            System.err.println("Usage: " + getClass().getSimpleName() + " [generic options] <input path> <output path>");
            ToolRunner.printGenericCommandUsage(System.err);

            return -1;
        }

        JobConf conf = new JobConf(getConf(), getClass());
        conf.setJobName("citation-combiner-with-chaining");

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        /* Each chain element gets its own JobConf created with loadDefaults = false. */
        JobConf map1Conf = new JobConf(false);

        ChainMapper.addMapper(conf, CitationInputSplitMapper.class, LongWritable.class, Text.class, Text.class,
                Text.class, true, map1Conf);

        JobConf map2Conf = new JobConf(false);

        ChainMapper.addMapper(conf, CitationHeaderStripMapper.class, Text.class, Text.class, Text.class, Text.class,
                true, map2Conf);

        JobConf red1Conf = new JobConf(false);

        ChainReducer.setReducer(conf, CitationReducer.class, Text.class, Text.class, Text.class, Text.class, true,
                red1Conf);

        JobClient.runJob(conf);

        return 0;
    }

    /**
     * Command-line entry point. Hands the raw arguments to {@link ToolRunner}, which parses the generic Hadoop
     * options into the tool's configuration and passes the remaining arguments to
     * {@link #run(String[])}. The JVM exits with the status returned by the tool.
     *
     * @param args generic Hadoop options followed by the input and output paths
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        /*
         * Do not pre-parse the arguments here: options parsed into a separate Configuration would be stripped from
         * the argument list and never reach the Configuration the tool actually runs with.
         */
        final int exitCode = ToolRunner.run(new Configuration(), new CitationCombinerWithChaining(), args);

        System.exit(exitCode);
    }
}