de.tudarmstadt.lt.wiki.statistics.ResourceInlinkCount.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.lt.wiki.statistics.ResourceInlinkCount.java

Source

package de.tudarmstadt.lt.wiki.statistics;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

import de.tudarmstadt.lt.util.WikiUtil;

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

/**
 * Two chained MapReduce jobs over Wikipedia link data.
 * <p>
 * Job 1 counts the number of in-links per resource. To do so, lines of the
 * following format:
 * <pre>
 * "link text" \t link_target \t Sentence containing &lt;head&gt;link text&lt;/head&gt;
 * </pre>
 * are mapped to key-value pairs of the form {@code link_target -> 1} (the
 * target is first passed through {@code WikiUtil.getLinkedResource} together
 * with the redirect table) and then reduced to
 * {@code link_target -> count(link_target)}.
 * <p>
 * Job 2 reads the output of job 1 and maps every {@code key \t count} line to
 * {@code count -> 1}, which is summed up to
 * {@code count -> number of lines having that count}.
 */
public class ResourceInlinkCount {
    /**
     * Mapper of job 1: emits {@code (link target, 1)} for every well-formed
     * input line. The redirect table is loaded once, when the mapper instance
     * is constructed.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        /** Tab-separated redirect table with two columns: source and destination. */
        private static final String REDIRECTS_FILE = "wiki/enwiki-redirects.txt";

        /** Constant value emitted for every link occurrence; never mutated. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Redirect table; keys and values have spaces replaced by underscores. */
        HashMap<String, String> redirects = new HashMap<String, String>();

        /** Output key, reused across records to avoid one allocation per input line. */
        private final Text target = new Text();

        /**
         * Loads the redirect table from {@code wiki/enwiki-redirects.txt}.
         * <p>
         * Loading is best effort: if the file cannot be read, the problem is
         * reported on stderr and the mapper continues with whatever entries
         * were read up to that point (possibly none).
         */
        public Map() {
            Path path = new Path(REDIRECTS_FILE);
            try {
                // Deliberately not closed: FileSystem.get returns a cached instance
                // that may be shared with the rest of the task.
                FileSystem fs = FileSystem.get(new Configuration());
                // Decode explicitly as UTF-8 instead of the platform default so that
                // non-ASCII titles are read the same way on every node, and make
                // sure the stream is closed even if reading fails.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(fs.open(path), StandardCharsets.UTF_8))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        String[] parts = line.split("\t");
                        if (parts.length == 2) {
                            redirects.put(formatResourceName(parts[0]), formatResourceName(parts[1]));
                        } else {
                            System.err.println("Error: redirect line does not have 2 columns!: " + line);
                        }
                    }
                }
                System.out.println();
                System.out.println("Done reading redirects. Entries: " + redirects.size());
            } catch (IOException e) {
                System.err.println("Error: could not read redirects from " + path
                        + "; continuing with " + redirects.size() + " entries");
                e.printStackTrace();
            }
        }

        /**
         * Normalizes a resource name by replacing every space with an underscore.
         *
         * @param resource resource name as found in the redirect table
         * @return the name with {@code ' '} replaced by {@code '_'}
         */
        private static String formatResourceName(String resource) {
            return resource.replace(' ', '_');
        }

        /**
         * Emits {@code (target, 1)} for a line with exactly three tab-separated
         * columns; all other lines are skipped silently.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one input line
         * @param context sink for the emitted pair
         */
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] parts = value.toString().split("\t");
            // A valid line has 3 splits: link text, link target and context
            if (parts.length == 3) {
                // NOTE(review): presumably resolves the target through the redirect
                // table - confirm against WikiUtil.getLinkedResource.
                target.set(WikiUtil.getLinkedResource(redirects, parts[1]));
                context.write(target, ONE);
            }
        }
    }

    /**
     * Mapper of job 2: turns every {@code key \t count} line into
     * {@code (count, 1)}.
     */
    public static class Map2 extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
        /** Constant value emitted for every input line; never mutated. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Output key, reused across records to avoid one allocation per input line. */
        private final IntWritable inlinkCount = new IntWritable();

        /**
         * Emits {@code (count, 1)} for a line with exactly two tab-separated
         * columns; all other lines are skipped silently.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one input line
         * @param context sink for the emitted pair
         * @throws NumberFormatException if the second column is not an integer
         */
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] parts = value.toString().split("\t");
            if (parts.length == 2) {
                inlinkCount.set(Integer.parseInt(parts[1]));
                context.write(inlinkCount, ONE);
            }
        }
    }

    /**
     * Runs job 1 (in-link count per resource).
     *
     * @param inDir  input directory with the link lines
     * @param outDir output directory for the per-resource counts
     * @return {@code true} if the job completed successfully
     * @throws Exception if the job cannot be set up or is interrupted
     */
    public static boolean runJob1(String inDir, String outDir) throws Exception {
        Configuration conf = new Configuration();
        conf.set("mapred.child.java.opts", "-Xmx1200M");
        conf.set("mapred.job.map.memory.mb", "1280");
        return runSumJob(conf, inDir, outDir, Map.class, Text.class);
    }

    /**
     * Runs job 2 (number of output lines of job 1 per count value).
     *
     * @param inDir  input directory, i.e. the output directory of job 1
     * @param outDir output directory for the result
     * @return {@code true} if the job completed successfully
     * @throws Exception if the job cannot be set up or is interrupted
     */
    public static boolean runJob2(String inDir, String outDir) throws Exception {
        return runSumJob(new Configuration(), inDir, outDir, Map2.class, IntWritable.class);
    }

    /**
     * Configures and runs one summing job: text input, the given mapper, and
     * {@code IntSumReducer} as both combiner and reducer.
     *
     * @param conf        configuration carrying any job-specific settings
     * @param inDir       input directory
     * @param outDir      output directory
     * @param mapperClass mapper emitting {@code (keyClass, IntWritable)} pairs
     * @param keyClass    key type of both the map output and the job output
     * @return {@code true} if the job completed successfully
     */
    private static boolean runSumJob(Configuration conf, String inDir, String outDir,
            Class<? extends Mapper<?, ?, ?, ?>> mapperClass, Class<?> keyClass)
            throws IOException, InterruptedException, ClassNotFoundException {
        // Must be set before Job.getInstance, which takes its own copy of conf.
        conf.set("mapreduce.job.queuename", "smalljob");
        Job job = Job.getInstance(conf);
        job.setJarByClass(ResourceInlinkCount.class);
        FileInputFormat.addInputPath(job, new Path(inDir));
        FileOutputFormat.setOutputPath(job, new Path(outDir));
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(mapperClass);
        // Summation is associative and commutative, so pre-aggregating on the map
        // side shrinks the shuffle without changing the final counts.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setMapOutputKeyClass(keyClass);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(keyClass);
        job.setOutputValueClass(IntWritable.class);
        return job.waitForCompletion(true);
    }

    /**
     * Runs job 1 into {@code <outDir>_inlink_counts} and, if it succeeds, job 2
     * from there into {@code outDir}. Exits with status 0 on success, 1 if a
     * job failed and 2 if the arguments are missing.
     *
     * @param args {@code args[0]} is the input directory, {@code args[1]} the
     *             output directory
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: ResourceInlinkCount <inDir> <outDir>");
            System.exit(2);
            return;
        }
        String inDir = args[0];
        String outDir = args[1];
        String tmpDir = outDir + "_inlink_counts";
        System.out.println("1+");
        boolean success = runJob1(inDir, tmpDir);
        System.out.println("2 " + success);
        if (success) {
            System.out.println("3");
            success = runJob2(tmpDir, outDir);
        }
        System.out.println("4 " + success);
        System.exit(success ? 0 : 1);
    }
}