edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java Source code

Introduction

Here is the source code for edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java, a three-phase Hadoop MapReduce tool from the Cloud9 library that extracts a weighted article link graph from a Wikipedia XML dump.

Source

/*
 * Cloud9: A MapReduce Library for Hadoop
 *  
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
package edu.umd.cloud9.collection.wikipedia;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.pair.PairOfStringInt;

/**
 * Tool for extracting the weighted link graph from the Wikipedia corpus.
 * Each node in the graph represents a Wikipedia article, each edge is a link
 * between two articles, and the weight of an edge corresponds to the number
 * of times the destination article is linked from the content of the source
 * article.
 * <br>
 * 
 * The graph is emitted as one edge per line: a source node id, followed by
 * a destination node id and the weight of the link.
 * <br>
 * A sample invocation of this tool:<br>
 * <code>
 * hadoop jar cloud9-[version].jar edu...BuildWikipediaWeightedLinkGraph 
 * -input [wikipedia XML dump file] -reduce [reducer count] -phase 3
 * </code>
 * 
 * @since 26 Feb 2014: added JsonLongDoubleFloatDoubleVertexInputFormat support
 * that will be used to interface with Apache Giraph algorithms
 * 
 * @author Tuan
 * @version 0.2
 * @since 13 May 2012
 *
 */
public class BuildWikipediaWeightedLinkGraph extends Configured implements Tool {

    private static final Logger log = Logger.getLogger(BuildWikipediaWeightedLinkGraph.class);

    private static final String LANG_OPTION = "lang";
    private static final String INPUT_OPTION = "input";
    private static final String REDUCE_NO = "reduce";
    private static final String PHASE = "phase";

    // Output format: text --> TextOutputFormat
    // giraphjson --> VertexWithDoubleValueDoubleEdgeTextOutputFormat
    // TODO: 26.02.2014 13:35 implement the interfacing between Cloud9 and Giraph
    private static final String OUTPUT_FORMAT_OPTION = "outf";

    /** 
     * Parses each Wikipedia article and emits, for every distinct outgoing
     * link, a tuple keyed by the destination title and carrying the source
     * title together with the number of times the destination is linked from
     * the source article.
     * 
     * The output is of the form (String1, <String2, int>), where the key
     * String1 is the title of the destination node, and the meaning of the
     * value varies:
     * - if String1 = String2, then int is the pageId of article String1
     *   (a "structure" message);
     * - if String1 <> String2 and int is -1, then String1 is a redirect page
     *   and all of its incoming links should be redirected to page String2;
     * - if String1 <> String2 and int is positive, then the link from String2
     *   to String1 with weight int should later be updated with the page id
     *   of String1.
     *  
     * These conventions are a "quick-and-dirty" hack to pass the Wikipedia
     * article info along with the ordinary link messages.
     */
    private static class LinkEmitMapClass extends Mapper<LongWritable, WikipediaPage, Text, PairOfStringInt> {

        private Text newKey = new Text();
        private PairOfStringInt pair = new PairOfStringInt();
        private Object2IntOpenHashMap<String> map = new Object2IntOpenHashMap<String>();

        @Override
        protected void map(LongWritable key, WikipediaPage p, Context context)
                throws IOException, InterruptedException {

            // only articles are emitted
            boolean redirected = false;
            if (p.isRedirect()) {
                redirected = true;
            } else if (!p.isArticle())
                return;
            map.clear();
            String title = p.getTitle().trim();

            // MediaWiki titles are case-insensitive in their first character,
            // so normalize by upper-casing a lower-case first letter.
            if (title.isEmpty())
                return;
            String fc = title.substring(0, 1);
            if (fc.matches("[a-z]")) {
                title = title.replaceFirst(fc, fc.toUpperCase());
            }

            // do not pass the structure message of a redirect article
            if (!redirected) {
                newKey.set(title);
                int id = Integer.parseInt(p.getDocid());
                pair.set(title, id);
                context.write(newKey, pair);
            }

            for (String t : extractLinkDestinations(p)) {
                t = t.trim();
                if (t.isEmpty())
                    continue;
                fc = t.substring(0, 1);
                if (fc.matches("[a-z]")) {
                    t = t.replaceFirst(fc, fc.toUpperCase());
                }
                if (title.equals(t))
                    continue;
                if (redirected) {
                    newKey.set(title);
                    pair.set(t, -1);
                    context.write(newKey, pair);
                    return;
                } else {
                    if (!map.containsKey(t)) {
                        map.put(t, 1);
                    } else {
                        int v = map.getInt(t);
                        map.put(t, v + 1);
                    }
                }
            }
            String[] keys = map.keySet().toArray(new String[map.size()]);

            for (String k : keys) {
                if (k.isEmpty())
                    continue;
                newKey.set(k);
                int cnt = map.get(k);
                pair.set(title, cnt);
                context.write(newKey, pair);
            }
        }
    }
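
    // A hypothetical walk-through of the message types above (titles and ids
    // are made up for illustration): an article "Anarchism" with docid 12
    // that links twice to [[political philosophy]] emits
    //   ("Anarchism", <"Anarchism", 12>)            <- structure message
    //   ("Political philosophy", <"Anarchism", 2>)  <- link message, keyed by
    //                                                  the case-normalized
    //                                                  destination title
    // while a redirect page "AccessibleComputing" pointing at
    // [[Computer accessibility]] emits only
    //   ("AccessibleComputing", <"Computer accessibility", -1>).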

    /**
     * Aggregates all incoming links for a particular article, detects
     * redirect pages, and forwards their incoming links to the redirect
     * target. Links are not emitted further if the node is not an actual
     * article (redirect, stub, etc.).
     */
    private static class RedirectResolveReduceClass extends Reducer<Text, PairOfStringInt, Text, PairOfStringInt> {

        private PairOfStringInt newPair = new PairOfStringInt();

        @Override
        protected void reduce(Text key, Iterable<PairOfStringInt> values, Context context)
                throws IOException, InterruptedException {

            // sentinel indicating whether the structure or redirect message
            // has been encountered while iterating over the values
            Text newKey = null;
            boolean redirected = false;
            List<PairOfStringInt> cache = new ArrayList<PairOfStringInt>();
            PairOfStringInt tmpItem;

            // internal counter for debugging
            int v, linkCnt = 0, totalCnt = 0;

            // a sample page to trace ill-formed articles
            String k, tmpPage = null;

            log.info("Processing page: " + key.toString());
            for (PairOfStringInt pair : values) {
                k = pair.getKey();
                v = pair.getValue();
                totalCnt++;
                // look for redirect message first
                if (v == -1) {
                    newKey = new Text(k);
                    redirected = true;
                    log.info("redirect message: (" + newKey + ", <" + k + ", " + v + ">)");
                }

                // then look for structure message
                else if (key.toString().equals(k)) {

                    // there can be some redirect articles that redirect to 
                    // themselves. Those articles should be ignored
                    if (!redirected) {
                        newKey = key;
                        newPair.set(k, v);
                        context.write(newKey, newPair);
                        log.info("structure message: (" + key + ", <" + k + ", " + v + ">)");
                    } else
                        return;
                }

                // items seen before the redirect or structure message in
                // the iterator are cached and emitted later
                else if (newKey == null) {
                    tmpItem = new PairOfStringInt(k, v);
                    cache.add(tmpItem);

                    // pick one article to debug in case the key is indeed an 
                    // ill-formed article
                    if (tmpPage == null) {
                        tmpPage = k;
                    }
                }

                // items after the redirect or structure message are emitted
                // right away. Boundary (probably never occurring) case: when
                // page p1 is a redirect to p2, and p2 links back to p1, the
                // link should be ignored
                else if (!newKey.toString().equals(k)) {
                    linkCnt++;
                    newPair.set(k, v);
                    context.write(newKey, newPair);
                } else {
                    log.warn("Weird! " + k + " links to its redirect.");
                }
            }

            // The ill-formed link
            if (newKey == null) {
                log.info("Ill-formed link: " + key + ", found in " + tmpPage);
                return;
            }

            // second run: update the remaining links with actual destination
            for (PairOfStringInt pair : cache) {
                linkCnt++;
                context.write(newKey, pair);
            }
            log.info("Page: " + key + ". Total count: " + totalCnt + ", passed link count: " + linkCnt);
        }
    }
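
    // Continuing the hypothetical example from the mapper (all titles made
    // up): for the key "AccessibleComputing" this reducer sees the redirect
    // message <"Computer accessibility", -1> plus link messages such as
    // <"Some article", 3> from pages linking to the redirect, and forwards
    // the latter as ("Computer accessibility", <"Some article", 3>). Incoming
    // links to a redirect page are thereby re-targeted at its destination.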

    private static class DestinationIdResolveReduceClass
            extends Reducer<Text, PairOfStringInt, Text, PairOfStringInt> {

        Text newKey = new Text();
        PairOfStringInt newValue = new PairOfStringInt();

        @Override
        protected void reduce(Text key, Iterable<PairOfStringInt> values, Context context)
                throws IOException, InterruptedException {
            String k, id = null;
            int v;
            List<PairOfStringInt> cache = new ArrayList<PairOfStringInt>();
            PairOfStringInt tmpItem;
            for (PairOfStringInt pair : values) {
                k = pair.getKey();
                v = pair.getValue();
                newKey.set(k);

                // find the structure message
                if (key.toString().equals(k)) {
                    id = String.valueOf(v);
                    newValue.set(k, v);
                    context.write(newKey, newValue);
                }

                // messages after the structure message will be emitted 
                // immediately
                else if (id != null) {
                    newValue.set(id, v);
                    context.write(newKey, newValue);
                }

                // messages before the structure message will be copied
                // and emitted later
                else {
                    tmpItem = new PairOfStringInt(k, v);
                    cache.add(tmpItem);
                }
            }
            if (id == null) {
                log.warn("Still found ill-formed article: " + key);
            } else
                for (PairOfStringInt pair : cache) {
                    k = pair.getKey();
                    newKey.set(k);
                    newValue.set(id, pair.getValue());
                    context.write(newKey, newValue);
                }
        }
    }
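
    // Hypothetical example (ids made up): for the key "Political philosophy"
    // with structure message <"Political philosophy", 25>, a link message
    // <"Anarchism", 2> is re-keyed by its source title and rewritten with the
    // destination's numeric id, yielding ("Anarchism", <"25", 2>); the
    // structure message itself passes through unchanged.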

    private static class SourceIdResolveReduceClass extends Reducer<Text, PairOfStringInt, Text, Text> {

        Text newKey = new Text();
        Text newValue = new Text();
        Object2IntOpenHashMap<String> map = new Object2IntOpenHashMap<String>();

        @Override
        protected void reduce(Text key, Iterable<PairOfStringInt> values, Context context)
                throws IOException, InterruptedException {
            map.clear();
            String k, id = null;
            int v;
            for (PairOfStringInt pair : values) {
                k = pair.getKey();
                v = pair.getValue();

                // find structure message, and accumulate the others
                if (key.toString().equals(k)) {
                    id = String.valueOf(v);
                } else {
                    if (!map.containsKey(k))
                        map.put(k, v);
                    else {
                        int value = map.getInt(k);
                        map.put(k, value + v);
                    }
                }
            }
            if (id == null) {
                log.warn("A page having no structure message: " + key);
            } else {
                newKey.set(id);
                String[] keys = map.keySet().toArray(new String[map.size()]);
                for (String dest : keys) {
                    newValue.set("\t" + dest + "\t" + map.get(dest));
                    context.write(newKey, newValue);
                }
            }
        }
    }
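
    // With TextOutputFormat writing "key TAB value" and the value above
    // already starting with a tab, each final line has the shape
    //   <srcId> \t \t <destId> \t <weight>
    // i.e. one line per outgoing edge. In the hypothetical example used in
    // the earlier comments, page 12 ("Anarchism") linking twice to page 25
    // would yield the line "12\t\t25\t2".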

    private static List<String> extractLinkDestinations(WikipediaPage wikiPage) {
        String page = wikiPage.getRawXML();
        int start = 0;
        List<String> links = new ArrayList<String>();

        while (true) {
            start = page.indexOf("[[", start);

            if (start < 0)
                break;

            int end = page.indexOf("]]", start);

            if (end < 0)
                break;

            String text = page.substring(start + 2, end);

            // skip empty links
            if (text.length() == 0) {
                start = end + 1;
                continue;
            }

            // skip special links
            if (text.indexOf(":") != -1) {
                start = end + 1;
                continue;
            }

            // if there is display (anchor) text after '|', keep only the
            // article title
            int a;
            if ((a = text.indexOf("|")) != -1) {
                text = text.substring(0, a);
            }

            // strip a section anchor after '#'
            if ((a = text.indexOf("#")) != -1) {
                text = text.substring(0, a);
            }

            // ignore article-internal links, e.g., [[#section|here]]
            if (text.length() == 0) {
                start = end + 1;
                continue;
            }

            links.add(text.trim());

            start = end + 1;
        }

        return links;
    }
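
    // A small illustration on made-up wikitext: given the raw text
    //   See [[Political philosophy|political theory]], [[Anarchism#History]],
    //   [[File:Logo.png]] and [[#Section|here]]
    // the method returns ["Political philosophy", "Anarchism"]: display text
    // after '|' and section anchors after '#' are stripped, links containing
    // ':' (File:, Category:, interwiki, ...) are skipped, and purely
    // article-internal links reduce to the empty string and are dropped.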

    private String phase1(String inputPath, int reduceNo, String lang)
            throws IOException, InterruptedException, ClassNotFoundException {

        String output = "tmp/wiki-link/phase1";

        Job job = Job.getInstance(getConf());
        job.setJobName("Build Wikipedia Weighted Link Graph. Phase 1");
        job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");

        job.setNumReduceTasks(reduceNo);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(output));

        if ("en".equals(lang)) {
            job.setInputFormatClass(WikipediaPageInputFormat.class);
        } else
            throw new InterruptedException("Wikipedia dump with language " + lang + " is not supported ");

        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PairOfStringInt.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(PairOfStringInt.class);

        job.setMapperClass(LinkEmitMapClass.class);
        job.setReducerClass(RedirectResolveReduceClass.class);

        job.waitForCompletion(true);

        return output;
    }

    private String phase2(String inputPath, int reduceNo)
            throws IOException, InterruptedException, ClassNotFoundException {

        String output = "tmp/wiki-link/phase2";

        Job job = Job.getInstance(getConf());
        job.setJobName("Build Wikipedia Weighted Link Graph. Phase 2");
        job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");

        job.setNumReduceTasks(reduceNo);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PairOfStringInt.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(PairOfStringInt.class);

        job.setReducerClass(DestinationIdResolveReduceClass.class);

        job.waitForCompletion(true);

        return output;
    }

    private String phase3(String inputPath, int reduceNo)
            throws IOException, InterruptedException, ClassNotFoundException {

        String output = "trace/phase3";

        Job job = Job.getInstance(getConf());
        job.setJobName("Build Wikipedia Weighted Link Graph. Phase 3");
        job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");

        job.setNumReduceTasks(reduceNo);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PairOfStringInt.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setReducerClass(SourceIdResolveReduceClass.class);

        job.waitForCompletion(true);

        return output;
    }

    @SuppressWarnings("static-access")
    public int run(String[] args) throws Exception {
        Options opts = new Options();

        Option langOpt = OptionBuilder.withArgName("lang").hasArg()
                .withDescription("language of the Wikipedia dump file").create(LANG_OPTION);

        Option inputOpt = OptionBuilder.withArgName("input-path").hasArg().withDescription("XML dump file path")
                .create(INPUT_OPTION);

        Option reduceOpt = OptionBuilder.withArgName("reduce-no").hasArg().withDescription("number of reducer nodes")
                .create(REDUCE_NO);

        Option phaseOpt = OptionBuilder.withArgName("phase-no").hasArg()
                .withDescription("the last phase to run (1, 2, or 3)").create(PHASE);

        opts.addOption(langOpt);
        opts.addOption(inputOpt);
        opts.addOption(reduceOpt);
        opts.addOption(phaseOpt);

        CommandLine cl;
        CommandLineParser parser = new GnuParser();
        try {
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.println("Error parsing command line: " + e.getMessage());
            return -1;
        }
        if (!cl.hasOption(INPUT_OPTION)) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(getClass().getName(), opts);
            ToolRunner.printGenericCommandUsage(System.out);
            return -1;
        }
        int reduceNo = 1;
        if (cl.hasOption(REDUCE_NO)) {
            try {
                reduceNo = Integer.parseInt(cl.getOptionValue(REDUCE_NO));
            } catch (NumberFormatException e) {
                System.err.println("Error parsing reducer number: " + e.getMessage());
            }
        }
        int phase = 1;
        if (cl.hasOption(PHASE)) {
            try {
                phase = Integer.parseInt(cl.getOptionValue(PHASE));
            } catch (NumberFormatException e) {
                System.err.println("Error parsing phase number: " + e.getMessage());
            }
        }
        String lang = "en";
        if (cl.hasOption(LANG_OPTION)) {
            lang = cl.getOptionValue(LANG_OPTION);
        }

        String input = cl.getOptionValue(INPUT_OPTION);
        if (phase == 1) {
            phase1(input, reduceNo, lang);
        } else if (phase == 2) {
            String output = phase1(input, reduceNo, lang);
            phase2(output, reduceNo);
        } else if (phase == 3) {
            String path = phase1(input, reduceNo, lang);
            path = phase2(path, reduceNo);
            path = phase3(path, reduceNo);
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new BuildWikipediaWeightedLinkGraph(), args);
    }
}
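
The tool can also be launched programmatically through Hadoop's ToolRunner,
mirroring the sample command line in the class Javadoc. Below is a minimal
sketch; the dump path and reducer count are placeholders, not values from the
original source, and the class is assumed to live in (or import from) the
edu.umd.cloud9.collection.wikipedia package.

import org.apache.hadoop.util.ToolRunner;

public class RunLinkGraph {
    public static void main(String[] args) throws Exception {
        // Run all three phases over an English dump; phase 3 leaves the
        // final edge list (srcId, destId, weight) under trace/phase3.
        String[] toolArgs = {
                "-input", "/data/enwiki-pages-articles.xml", // hypothetical HDFS path
                "-reduce", "8",                              // hypothetical reducer count
                "-phase", "3"
        };
        ToolRunner.run(new BuildWikipediaWeightedLinkGraph(), toolArgs);
    }
}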