de.tudarmstadt.lt.n2n.hadoop.RemoveExactDuplicatesJob.java Source code

Introduction

Here is the source code for de.tudarmstadt.lt.n2n.hadoop.RemoveExactDuplicatesJob.java
Source

/*
 *   Copyright 2012
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.n2n.hadoop;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Hadoop MapReduce job (old {@code org.apache.hadoop.mapred} API) that removes exact duplicate
 * lines from text input.
 *
 * <p>The mapper emits every input line as a key with a {@link NullWritable} value; the reducer
 * (also registered as combiner) emits each distinct key exactly once. Because identical lines
 * are grouped under the same key, every distinct line appears once in the output.
 *
 * <p>Usage: {@code RemoveExactDuplicatesJob [generic options] <input path> <output path>}
 *
 * @author Steffen Remus
 */
public class RemoveExactDuplicatesJob extends Configured implements Tool {

    /** Exit code returned by {@link #run(String[])} when required arguments are missing. */
    private static final int EXIT_USAGE = 2;

    /**
     * Command line entry point; delegates to {@link ToolRunner} so that generic Hadoop options
     * are handled before {@link #run(String[])} is invoked, then exits with the tool's result.
     *
     * @param args command line arguments (generic options followed by input and output path)
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new RemoveExactDuplicatesJob(), args);
        System.exit(res);
    }

    /**
     * Configures and runs the de-duplication job, blocking until it has finished.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path; any
     *             further arguments are ignored
     * @return {@code 0} when the job has completed, {@code 2} when fewer than two arguments
     *         were supplied
     * @throws Exception if the job cannot be submitted or fails while running
     */
    @Override
    public int run(String[] args) throws Exception {
        // Fail with a readable usage message instead of an ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 2) {
            System.err.println("Usage: " + RemoveExactDuplicatesJob.class.getSimpleName()
                    + " [generic options] <input path> <output path>");
            ToolRunner.printGenericCommandUsage(System.err);
            return EXIT_USAGE;
        }

        JobConf conf = new JobConf(getConf(), RemoveExactDuplicatesJob.class);
        conf.setJobName(RemoveExactDuplicatesJob.class.getSimpleName());

        conf.setMapperClass(LineMapper.class);
        // The reducer has identical input and output types and emits each key once, so it can
        // also serve as combiner: duplicates are collapsed on the map side before the shuffle,
        // and the final output is unchanged.
        conf.setCombinerClass(KeyReducer.class);
        conf.setReducerClass(KeyReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // NOTE: the output path is deliberately not deleted here; removing an existing output
        // directory up front is only appropriate for testing.

        // runJob blocks until the job is complete and is documented to throw an IOException
        // if the job fails, so reaching the return statement means success.
        JobClient.runJob(conf);
        return 0;
    }

    /**
     * Mapper that turns each input line into an output key, discarding the input key supplied
     * by the input format.
     */
    public static class LineMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, NullWritable> {
        /**
         * Emits the line itself as key with a {@link NullWritable} value.
         *
         * @param key      input key (unused)
         * @param value    the input line
         * @param output   collector receiving the {@code (line, null)} pair
         * @param reporter used to signal progress to the framework
         * @throws IOException if the collector fails to write the record
         */
        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, NullWritable> output, Reporter reporter)
                throws IOException {
            reporter.progress();
            output.collect(value, NullWritable.get());
        }
    }

    /**
     * Reducer (and combiner) that emits every key exactly once, regardless of how many values
     * were grouped under it.
     */
    public static class KeyReducer extends MapReduceBase
            implements Reducer<Text, NullWritable, Text, NullWritable> {
        /**
         * Emits the key once; the grouped values carry no information and are not read.
         *
         * @param key      a distinct line
         * @param values   the values grouped under {@code key} (unused)
         * @param output   collector receiving the {@code (line, null)} pair
         * @param reporter used to signal progress to the framework
         * @throws IOException if the collector fails to write the record
         */
        @Override
        public void reduce(Text key, Iterator<NullWritable> values, OutputCollector<Text, NullWritable> output,
                Reporter reporter) throws IOException {
            reporter.progress();
            output.collect(key, NullWritable.get());
        }
    }

}