edu.umd.cloud9.webgraph.TrecExtractLinks.java Source code

Introduction

Here is the source code for edu.umd.cloud9.webgraph.TrecExtractLinks.java, a Hadoop MapReduce tool from the Cloud9 library that extracts hyperlinks and their anchor text from a TREC web collection. The mapper parses each document's HTML, normalizes the link targets, and emits anchor text keyed by target URL together with the source docno; the reducer merges the anchor text collected for each URL.
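
Before the full listing, here is a minimal driver sketch showing how the parameters this tool requires (see RequiredParameters in the source) might be supplied. It is not part of the original source: the normalizer and DocnoMapping class names are assumptions to be replaced for your collection, and the CollectionConfigurationManager (which the listing uses without an import, so it presumably lives in the same edu.umd.cloud9.webgraph package) must be set up for the target collection by the caller.

import org.apache.hadoop.conf.Configuration;

import edu.umd.cloud9.webgraph.CollectionConfigurationManager;
import edu.umd.cloud9.webgraph.TrecExtractLinks;

public class TrecExtractLinksDriver {
    // Hypothetical helper: fills in the configuration keys that TrecExtractLinks
    // reads in setup() and runTool(), then launches the job. The caller provides a
    // CollectionConfigurationManager already configured for the target collection.
    public static int extractLinks(String inputPath, String outputPath, String docnoMappingFile,
            CollectionConfigurationManager configer) throws Exception {
        Configuration conf = new Configuration();
        conf.set("Cloud9.InputPath", inputPath);
        conf.set("Cloud9.OutputPath", outputPath);
        conf.setInt("Cloud9.Mappers", 1);            // listed as required, but not read by this class itself
        conf.setInt("Cloud9.Reducers", 200);         // matches the default used in runTool()
        conf.setBoolean("Cloud9.IncludeInternalLinks", false);
        // Assumed implementations -- substitute whatever normalizer and DocnoMapping
        // classes are appropriate for your collection.
        conf.set("Cloud9.AnchorTextNormalizer",
                "edu.umd.cloud9.webgraph.normalizer.AnchorTextBasicNormalizer");
        conf.set("Cloud9.DocnoMappingClass",
                "edu.umd.cloud9.collection.trec.TrecDocnoMapping");
        conf.set("Cloud9.DocnoMappingFile", docnoMappingFile);

        return new TrecExtractLinks(conf, configer).runTool();
    }
}

Note that although a single-argument constructor exists, runTool() calls configer.applyJobConfig(job), so in practice the two-argument constructor is the one to use.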

Source

/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.webgraph;

import java.io.IOException;
import java.io.UTFDataFormatException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.collection.WebDocument;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
import edu.umd.cloud9.webgraph.normalizer.AnchorTextNormalizer;

/**
 * 
 * @author Nima Asadi
 * @author Fangyue Wang
 * @author metzler
 *
 */

public class TrecExtractLinks extends PowerTool {
    private static final Logger LOG = Logger.getLogger(TrecExtractLinks.class);

    /**
     * Mapper: for each input document, emits (1) a record keyed on the document's own
     * normalized URL carrying its docno, and (2) for every hyperlink found in the page, a
     * record keyed on the normalized target URL carrying the normalized anchor text and the
     * source docno, tagged as an internal or external in-link.
     */
    public static class Map extends Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>> {

        public static enum LinkCounter {
            INPUT_DOCS,    // number of input documents
            OUTPUT_DOCS,   // number of output documents
            INVALID_DOCNO, // number of malformed documents
            INVALID_URL,   // number of malformed URLs
            TEXT_TOO_LONG, // number of anchor text strings that are abnormally long
            PARSER_FAILED  // number of times the HTML parser fails
        };

        private static String base;     // base URL of the current document
        private static String baseHost; // host of the base URL
        private static int docno;       // docno of the current document

        private static final Text keyWord = new Text(); // output key for the mappers
        private static final ArrayListWritable<AnchorText> arrayList =
                new ArrayListWritable<AnchorText>();    // output value for the mappers

        private static DocnoMapping docnoMapping = null;

        private static final Parser parser = new Parser();
        private static final NodeFilter filter = new NodeClassFilter(LinkTag.class);
        private static NodeList list;

        private static boolean includeInternalLinks;

        private static AnchorTextNormalizer normalizer;

        @Override
        public void setup(Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context context)
                throws IOException {
            Configuration conf = context.getConfiguration();

            String docnoMappingClass = conf.get("Cloud9.DocnoMappingClass");
            try {
                docnoMapping = (DocnoMapping) Class.forName(docnoMappingClass).newInstance();
            } catch (Exception e) {
                throw new RuntimeException("Error initializing DocnoMapping class!");
            }

            String docnoMappingFile = conf.get("Cloud9.DocnoMappingFile", null);
            if (docnoMappingFile != null) {
                Path docnoMappingPath = null;
                try {
                    Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
                    if (localFiles != null) {
                        docnoMappingPath = localFiles[0];
                    } else {
                        docnoMappingPath = new Path(conf.get("Cloud9.DocnoMappingFile"));
                    }
                } catch (IOException e) {
                    throw new RuntimeException("Unable to find DocnoMappingFile!");
                }

                try {
                    docnoMapping.loadMapping(docnoMappingPath, FileSystem.getLocal(conf));
                } catch (Exception e) {
                    e.printStackTrace();
                    throw new RuntimeException("Error initializing DocnoMapping!");
                }
            }

            includeInternalLinks = conf.getBoolean("Cloud9.IncludeInternalLinks", false);

            try {
                normalizer = (AnchorTextNormalizer) Class.forName(conf.get("Cloud9.AnchorTextNormalizer"))
                        .newInstance();
            } catch (Exception e) {
                e.printStackTrace();
                throw new RuntimeException("Error initializing AnchorTextNormalizer");
            }
        }

        @Override
        public void map(LongWritable key, WebDocument doc,
                Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context context)
                throws IOException, InterruptedException {
            context.getCounter(LinkCounter.INPUT_DOCS).increment(1);

            try {
                docno = docnoMapping.getDocno(doc.getDocid());
            } catch (NullPointerException e) {
                // Discard documents with an invalid document number
                context.getCounter(LinkCounter.INVALID_DOCNO).increment(1);
                return;
            }

            try {
                String url = doc.getURL().split("\n")[0];
                LOG.info("URI: " + url);
                base = normalizeURL(url);
            } catch (Exception e) {
                // Discard documents with which there is no URL associated
                context.getCounter(LinkCounter.INVALID_URL).increment(1);
                return;
            }

            if (base == null) {
                context.getCounter(LinkCounter.INVALID_URL).increment(1);
                return;
            }

            arrayList.clear();
            arrayList.add(new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, AnchorTextConstants.EMPTY_STRING,
                    docno));
            keyWord.set(base);
            context.write(keyWord, arrayList);

            // keeping track of the number of documents that have actually been
            // processed
            context.getCounter(LinkCounter.OUTPUT_DOCS).increment(1);

            try {
                baseHost = new URI(base).getHost();
            } catch (Exception e) {
                context.getCounter(LinkCounter.INVALID_URL).increment(1);
                return;
            }

            if (baseHost == null) {
                context.getCounter(LinkCounter.INVALID_URL).increment(1);
                return;
            }

            try {
                // initializing the parser with the new HTML content
                parser.setInputHTML(doc.getContent());

                // Setting base URL for the current document
                NodeList nl = parser.parse(null);
                BaseHrefTag baseTag = new BaseHrefTag();
                baseTag.setBaseUrl(base);
                nl.add(baseTag);

                // re-initializing the parser with the fixed content
                parser.setInputHTML(nl.toHtml());

                // listing all LinkTag nodes
                list = parser.extractAllNodesThatMatch(filter);
            } catch (ParserException e) {
                context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
                return;
            } catch (StackOverflowError e) {
                context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
                return;
            }

            for (int i = 0; i < list.size(); i++) {
                LinkTag link = (LinkTag) list.elementAt(i);
                String anchor = link.getLinkText();
                String url = normalizeURL(link.extractLink());

                if (url == null) {
                    continue;
                }

                if (url.equals(base)) { // discard self links
                    continue;
                }

                String host = null;
                try {
                    host = new URI(url).getHost();
                } catch (Exception e) {
                    continue;
                }

                if (host == null) {
                    continue;
                }

                if (anchor == null) {
                    anchor = "";
                }

                // normalizing the anchor text
                anchor = normalizer.process(anchor);

                arrayList.clear();
                if (baseHost.equals(host)) {

                    if (!includeInternalLinks)
                        continue;

                    arrayList.add(new AnchorText(AnchorTextConstants.Type.INTERNAL_IN_LINK.val, anchor, docno));

                } else {
                    arrayList.add(new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, anchor, docno));
                }

                try {
                    keyWord.set(url);
                    context.write(keyWord, arrayList);
                } catch (UTFDataFormatException e) {
                    context.getCounter(LinkCounter.TEXT_TOO_LONG).increment(1);

                    keyWord.set(url);
                    byte flag = arrayList.get(0).getType();
                    arrayList.clear();
                    arrayList.add(new AnchorText(flag, AnchorTextConstants.EMPTY_STRING, docno));
                    context.write(keyWord, arrayList);
                }

            }
        }

        // Normalizes a URL: lowercases the scheme and host, resolves "." and ".." path
        // segments, strips trailing slashes from the path, and drops any query string or
        // fragment (e.g. "HTTP://Example.COM/a/b/../c/?q=1" becomes "http://example.com/a/c").
        // Returns null if the URL cannot be parsed or has no scheme or host.
        private static String normalizeURL(String url) {
            try {
                URI uri = new URI(url).normalize(); // first apply built-in normalizer
                String scheme = uri.getScheme().toLowerCase(); // schemes are not case sensitive
                String host = uri.getHost().toLowerCase(); // hosts are not case sensitive
                String path = uri.getPath();
                while (path != null && path.length() > 0 && path.charAt(path.length() - 1) == '/') { // remove trailing forward slashes from path
                    path = path.substring(0, path.length() - 1);
                }
                return (new URI(scheme, host, path, null)).toString();
            } catch (Exception e) {
                return null;
            }
        }
    }

    /**
     * Reducer (also used as the combiner): merges all AnchorText objects collected for a URL,
     * folding together entries with the same type and text and accumulating their source
     * documents.
     */
    public static class Reduce
            extends Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>> {

        private static final ArrayListWritable<AnchorText> arrayList = new ArrayListWritable<AnchorText>();
        private static boolean pushed;

        @Override
        public void reduce(Text key, Iterable<ArrayListWritable<AnchorText>> values,
                Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>>.Context context)
                throws IOException, InterruptedException {

            arrayList.clear();

            for (ArrayListWritable<AnchorText> packet : values) {
                for (AnchorText data : packet) {

                    pushed = false;

                    for (int i = 0; i < arrayList.size(); i++) {
                        if (arrayList.get(i).equalsIgnoreSources(data)) {
                            arrayList.get(i).addDocumentsFrom(data);
                            pushed = true;
                            break;
                        }
                    }

                    if (!pushed)
                        arrayList.add(data.clone());
                }
            }

            context.write(key, arrayList);
        }
    }

    public static final String[] RequiredParameters = { "Cloud9.InputPath", "Cloud9.OutputPath", "Cloud9.Mappers",
            "Cloud9.Reducers", "Cloud9.IncludeInternalLinks", "Cloud9.AnchorTextNormalizer",
            "Cloud9.DocnoMappingClass", "Cloud9.DocnoMappingFile" };

    @Override
    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    public TrecExtractLinks(Configuration conf) {
        super(conf);
    }

    CollectionConfigurationManager configer;

    public TrecExtractLinks(Configuration conf, CollectionConfigurationManager configer) {
        super(conf);
        this.configer = configer;
    }

    @Override
    public int runTool() throws Exception {

        Configuration conf = getConf();
        conf.set("mapred.child.java.opts", "-Xmx3072m");
        conf.setInt("mapred.task.timeout", 60000000);
        Job job = new Job(conf);

        int numReducers = conf.getInt("Cloud9.Reducers", 200);

        String inputPath = conf.get("Cloud9.InputPath");
        String outputPath = conf.get("Cloud9.OutputPath");

        String mappingFile = conf.get("Cloud9.DocnoMappingFile");

        FileSystem fs = FileSystem.get(conf);
        if (!fs.exists(new Path(mappingFile))) {
            throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
        }

        DistributedCache.addCacheFile(new Path(mappingFile).toUri(), job.getConfiguration());

        job.setJobName("ExtractLinks");
        job.setNumReduceTasks(numReducers);

        job.setJarByClass(TrecExtractLinks.class);
        job.setMapperClass(TrecExtractLinks.Map.class);
        job.setCombinerClass(TrecExtractLinks.Reduce.class);
        job.setReducerClass(TrecExtractLinks.Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArrayListWritable.class);

        configer.applyJobConfig(job);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        recursivelyAddInputPaths(job, inputPath);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        LOG.info("ExtractLinks");
        LOG.info(" - input path: " + inputPath);
        LOG.info(" - output path: " + outputPath);
        LOG.info(" - mapping file: " + mappingFile);
        LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

        job.waitForCompletion(true);
        return 0;
    }

    public static void recursivelyAddInputPaths(Job job, String path) throws IOException {
        FileSystem fs;
        try {
            fs = FileSystem.get(new URI(path), job.getConfiguration());
        } catch (URISyntaxException e) {
            throw new RuntimeException("Error recursively adding path -- " + path);
        }

        FileStatus[] ls = fs.listStatus(new Path(path));
        for (FileStatus status : ls) {
            // skip anything that starts with an underscore, as it often indicates
            // a log directory or another special type of Hadoop file
            if (status.getPath().getName().startsWith("_")) {
                continue;
            }

            if (status.isDir()) {
                recursivelyAddInputPaths(job, status.getPath().toString());
            } else {
                FileInputFormat.addInputPath(job, status.getPath());
            }
        }
    }

}
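
The job writes block-compressed SequenceFiles whose keys are normalized URLs (Text) and whose values are ArrayListWritable<AnchorText> lists. Below is a minimal sketch, not part of the original source, for inspecting that output; the part-file path is a placeholder.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.webgraph.data.AnchorText;

public class DumpExtractedLinks {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Placeholder path -- point this at one part file under Cloud9.OutputPath.
        Path part = new Path("extracted-links/part-r-00000");

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, part, conf);
        Text url = new Text();
        ArrayListWritable<AnchorText> anchors = new ArrayListWritable<AnchorText>();

        // Each record is one target URL with all anchor text collected for it.
        while (reader.next(url, anchors)) {
            System.out.println(url + "\t" + anchors);
        }
        reader.close();
    }
}

SequenceFile.Reader reads the codec from the file header, so the block compression configured in runTool() is handled transparently as long as the codec's libraries are available.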