name.abhijitsarkar.hadoop.citation.CitationCombiner.java Source code

Java tutorial

Introduction

Here is the source code for name.abhijitsarkar.hadoop.citation.CitationCombiner.java

Source

/*******************************************************************************
 * Copyright (c) 2014, the original author or authors.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * A copy of the GNU General Public License accompanies this software, 
 * and is also available at http://www.gnu.org/licenses.
 *******************************************************************************/
package name.abhijitsarkar.hadoop.citation;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class works on the cite.txt. It outputs a space separated key-value pair where the key is a citation ID and the
 * value is a comma-separated list of citations that refer to the key.
 * 
 * @author Abhijit Sarkar
 */
public class CitationCombiner extends Configured implements Tool {
    public static final Logger LOGGER = LoggerFactory.getLogger(CitationCombiner.class);

    public static class CitationMapper extends MapReduceBase implements Mapper<Text, Text, Text, Text> {

        public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            LOGGER.debug("Key: {}, Value: {}.", key, value);

            /* Skip the header row */
            try {
                Long.valueOf(key.toString());

                output.collect(value, key);
            } catch (NumberFormatException nfe) {
                LOGGER.warn("NumberFormatException: {}.", nfe.getMessage());
            }
        }
    }

    public static class CitationReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
        static final byte[] COMMA = new Text(",").getBytes();

        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            Text value = null;
            Text citations = new Text();

            while (values.hasNext()) {
                value = values.next();

                if (value != null && value.getLength() > 0) {
                    citations.append(value.getBytes(), 0, value.getLength());

                    if (values.hasNext()) {
                        citations.append(COMMA, 0, 1);
                    }
                }
            }

            LOGGER.debug("Key: {}, Citations: {}.", key, citations.toString());

            output.collect(key, citations);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), getClass());
        conf.setJobName("citation-combiner");

        /* This is to set the separator byte for KeyValueTextInputFormat */
        conf.set("key.value.separator.in.input.line", ",");

        conf.setMapperClass(CitationMapper.class);
        conf.setReducerClass(CitationReducer.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setInputFormat(KeyValueTextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);

        return 0;
    }

    public static void main(String[] args) throws Exception {
        // TODO: Get rid of the GenericOptionsParser and just pass the args to the ToolRunner
        GenericOptionsParser parser = new GenericOptionsParser(new Configuration(), args);

        ToolRunner.run(new CitationCombiner(), parser.getRemainingArgs());
    }
}