nl.utwente.mirex.AnchorExtract.java Source code

Introduction

Here is the source code for nl.utwente.mirex.AnchorExtract.java

Source

/*
 * Copyright Notice:
 * -----------------
 *
 * The contents of this file are subject to the PfTijah Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://dbappl.cs.utwente.nl/Legal/PfTijah-1.1.html
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * The Original Code is the Mirex system.
 * 
 * The Initial Developer of the Original Code is the "University of Twente".
 * Portions created by the "University of Twente" are
 * Copyright (C) 2010 "University of Twente".
 * All Rights Reserved.
 */

package nl.utwente.mirex;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import edu.cmu.lemurproject.WarcFileInputFormat;
import edu.cmu.lemurproject.WarcRecord;
import edu.cmu.lemurproject.WritableWarcRecord;

/**
 * <b>Runs MapReduce job:</b> Extracts anchor text from HTML documents. 
 * The input path should be a file, or a directory of files, on 
 * the Hadoop file system in the Web Archive (WARC) format.
 * The output consists of gzipped, tab-separated files containing: 
 * <i>WARC-TREC-ID, URL, anchor text 1, anchor text 2, </i>
 * etc. Anchors are only collected for documents inside the collection;
 * documents in the collection without inlinks are not listed.
 * Anchor text collection is cut off once more than 10 MB of anchors 
 * has been gathered for a single page, to keep the output manageable.
 * This MapReduce program is described in: 
 * <blockquote>
 *   Djoerd Hiemstra and Claudia Hauff. 
 *   "MIREX: MapReduce Information Retrieval Experiments" 
 *   Technical Report TR-CTIT-10-15, Centre for Telematics 
 *   and Information Technology, University of Twente, 
 *   ISSN 1381-3625, 2010
 * </blockquote>
 * @author Djoerd Hiemstra
 * @author Guido van der Zanden
 * @since 0.1
 */
public class AnchorExtract {

    private final static String MirexId = "MIREX-TREC-ID: ";
    private final static Pattern mirexIdPat = Pattern.compile(MirexId + "(.+)$");
    private final static int maxCapacity = 10000000; // collect at most 10 MB of anchor text per URL
    private final static int maxHtml = 50000; // parse at most the first 50 KB of each web page
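
    /*
     * Output illustration (hypothetical record, not part of the original
     * source): TextOutputFormat writes "key<TAB>value", so one line of the
     * gzipped job output would look like
     *
     *   clueweb09-en0000-00-00001<TAB>http://example.org/<TAB>anchor one<TAB>anchor two
     *
     * i.e. the WARC-TREC-ID, the page URL, and one field per anchor text.
     */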

    /**
     * -- Mapper: Extracts anchors. 
     */
    public static class Map extends Mapper<LongWritable, WritableWarcRecord, Text, Text> {

        private final static Pattern scriptPat = Pattern.compile("<script(.*?)</script>",
                Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL),
                anchorPat = Pattern.compile("<a ([^>]*)href=[\"']?([^> '\"]+)([^>]*)>(.*?)</a>",
                        Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL),
                relUrlPat = Pattern.compile("^/"), absUrlPat = Pattern.compile("^[a-z]+://"),
                nofollowPat = Pattern.compile("rel=[\"']?nofollow", Pattern.CASE_INSENSITIVE); // ignore links with rel="nofollow"
        private final static String noIndexHTML = "/$|/index\\.[a-z][a-z][a-z][a-z]?$";

        private static String makeAbsoluteUrl(String targetUrl, String relativeUrl) {
            /* takes url of web page (targetUrl) and relative url to make absolute url */
            // assert !targetUrl.equals("");
            String absUrl;
            targetUrl = absUrlPat.matcher(targetUrl).replaceAll(""); // remove protocol header
            relativeUrl = relativeUrl.replaceAll("[ \n\r\t]", "");
            Matcher matcher = relUrlPat.matcher(relativeUrl);
            if (matcher.find())
                absUrl = targetUrl.replaceAll("/.*$", "") + relativeUrl;
            else {
                matcher = absUrlPat.matcher(relativeUrl);
                if (matcher.find())
                    absUrl = matcher.replaceAll("");
                else
                    absUrl = targetUrl.replaceAll("/[^/]+$", "") + '/' + relativeUrl;
            }
            return "http://" + absUrl.replaceAll("/.[^/]+/\\.\\./|//", "/").replaceFirst(noIndexHTML, "");
        }
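
        /*
         * Worked examples for makeAbsoluteUrl (hypothetical URLs, not part
         * of the original source):
         *
         *   makeAbsoluteUrl("http://example.org/a/b.html", "c.html")
         *     -> "http://example.org/a/c.html"  (resolved against the page's directory)
         *   makeAbsoluteUrl("http://example.org/a/b.html", "/c.html")
         *     -> "http://example.org/c.html"    (root-relative: resolved against the host)
         *   makeAbsoluteUrl("http://example.org/a/b.html", "http://other.org/index.html")
         *     -> "http://other.org"             (already absolute; trailing /index.html stripped)
         */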

        /**
         * @param key record offset (not used)
         * @param value the web page
         * @param context emits (URL, anchor text <i>or</i> TREC-ID) pairs
         */
        public void map(LongWritable key, WritableWarcRecord value, Context context)
                throws IOException, InterruptedException {
            String baseUri, trecId, content;
            Text link = new Text(), anchor = new Text();
            Matcher matcher;
            WarcRecord thisRecord = value.getRecord();
            if (thisRecord.getHeaderRecordType().equals("response")) {
                baseUri = thisRecord.getHeaderMetadataItem("WARC-Target-URI").replaceFirst(noIndexHTML, "");
                trecId = thisRecord.getHeaderMetadataItem("WARC-TREC-ID");
                link.set(baseUri);
                anchor.set(MirexId + trecId);
                context.write(link, anchor); // we want to keep track of the TREC-IDs
                content = thisRecord.getContentUTF8();
                if (content.length() > maxHtml)
                    content = content.substring(0, maxHtml); // truncate websites 
                content = scriptPat.matcher(content).replaceAll(" ");
                matcher = anchorPat.matcher(content);
                while (matcher.find()) {
                    Matcher nomatch = nofollowPat.matcher(matcher.group(1) + matcher.group(3));
                    if (!nomatch.find()) {
                        link.set(makeAbsoluteUrl(baseUri, matcher.group(2)));
                        anchor.set(matcher.group(4).replaceAll("<[^>]+>|[ \n\t\r]+", " "));
                        context.write(link, anchor);
                    }
                }
            }
        }
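
        /*
         * Example (hypothetical record, not part of the original source):
         * for a "response" record with WARC-Target-URI http://example.org/a.html,
         * WARC-TREC-ID clueweb09-en0000-00-00001, and the body
         *
         *   <a href="b.html">Read more</a>
         *
         * the mapper emits two pairs:
         *
         *   (http://example.org/a.html, "MIREX-TREC-ID: clueweb09-en0000-00-00001")
         *   (http://example.org/b.html, "Read more")
         */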
    }

    /**
     * -- Combiner: Glues local anchor texts together.
     */
    public static class Combine extends Reducer<Text, Text, Text, Text> {

        /**
         * @param key URL
         * @param values anchor texts <i>or</i> TREC-ID
         * @param context emits (URL, anchor texts <i>or</i> TREC-ID) pairs
         */
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws InterruptedException, IOException {
            boolean first = true;
            StringBuilder anchors = new StringBuilder();
            for (Text value : values) {
                String anchor = value.toString();
                Matcher matcher = mirexIdPat.matcher(anchor);
                if (matcher.find()) {
                    context.write(key, new Text(anchor));
                } else {
                    if (anchors.length() < maxCapacity) {
                        if (first) {
                            anchors.append(anchor);
                            first = false;
                        } else {
                            anchors.append("\t").append(anchor);
                        }
                    }
                }
            }
            if (!first) {
                context.write(key, new Text(anchors.toString()));
            }
        }
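
        /*
         * Example (hypothetical values, not part of the original source):
         * for key http://example.org/ and values
         *
         *   ["MIREX-TREC-ID: clueweb09-en0000-00-00001", "home", "start page"]
         *
         * the combiner re-emits the TREC-ID value unchanged and glues the
         * rest into a single value "home<TAB>start page", so the reducer
         * still receives both kinds of values, but fewer of them.
         */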

    }

    /**
     * -- Reducer: Glues anchor texts together, and recovers TREC-ID.
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {

        /**
         * @param key URL
         * @param values anchor texts <i>or</i> TREC-ID
         * @param context emits (TREC-ID, URL + anchor texts) pairs
         */
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws InterruptedException, IOException {

            boolean found = false;
            String trecId = "";
            StringBuilder anchors = new StringBuilder();
            anchors.append(key.toString());

            for (Text value : values) {
                String anchor = value.toString();
                Matcher matcher = mirexIdPat.matcher(anchor);
                if (matcher.find()) {
                    trecId = matcher.group(1);
                } else if (anchors.length() < maxCapacity) {
                    anchors.append("\t").append(anchor);
                    found = true;
                }
            }
            if (found && !trecId.equals("")) { // compare string contents, not object references
                context.write(new Text(trecId), new Text(anchors.toString()));
                if (anchors.length() >= maxCapacity) {
                    System.err.println("Warning: Maximum capacity reached for: " + trecId);
                }
            }
        }
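
        /*
         * Example (continuing the hypothetical values above): for key
         * http://example.org/ and values
         *
         *   ["MIREX-TREC-ID: clueweb09-en0000-00-00001", "home<TAB>start page"]
         *
         * the reducer writes
         *
         *   (clueweb09-en0000-00-00001, "http://example.org/<TAB>home<TAB>start page")
         *
         * Pages without any anchor (found == false) or without a recovered
         * TREC-ID are silently dropped.
         */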
    }

    /**
     * Runs the MapReduce job "anchor text extraction"
     * @param args 0: path to the web collection on HDFS; 1: (non-existent) path that will receive the anchor texts
     * @usage
     * <code> hadoop jar mirex-0.2.jar nl.utwente.mirex.AnchorExtract /user/hadoop/ClueWeb09_English/&#x2a;/ /user/hadoop/ClueWeb09_Anchors </code> 
     */
    public static void main(String[] args) throws Exception {
        // Set job configuration
        Configuration conf = new Configuration();
        conf.setLong("mapred.task.timeout", 1800 * 1000L); // 30 minutes timeout
        Job job = new Job(conf, "AnchorExtract");
        job.setJarByClass(AnchorExtract.class);

        if (args.length != 2) {
            System.out.printf("Usage: %s inputFiles outputFile\n", AnchorExtract.class.getSimpleName());
            System.out.println("          inputFiles: path to data");
            System.out.println("          outputFile: directory where anchor text is stored");
            System.exit(1);
        }
        int argc = 0;
        String inputFiles = args[argc++];
        String outputFile = args[argc++];

        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setCombinerClass(Combine.class);

        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(WarcFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(inputFiles)); // use setInputPaths(job, inputFiles) to accept a comma-separated list of paths
        FileOutputFormat.setOutputPath(job, new Path(outputFile));
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1); // propagate job success or failure as the exit code
    }
}