org.apache.nutch.scoring.webgraph.Loops.java Source code

Introduction

Here is the source code for org.apache.nutch.scoring.webgraph.Loops.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.scoring.webgraph;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;

/**
 * The Loops job identifies cycles of loops inside of the web graph. This is
 * then used in the LinkRank program to remove those links from consideration
 * during link analysis.
 * 
 * This job will identify both reciprocal links and cycles of 2+ links up to a
 * set depth to check. The Loops job is expensive in both computational and
 * space terms. Because it checks outlinks of outlinks of outlinks for cycles
 * its intermediate output can be extremely large even if the end output is
 * rather small. Because of this the Loops job is optional and if it doesn't
 * exist then it won't be factored into the LinkRank program.
 */
public class Loops extends Configured implements Tool {

    /** Logger used by this tool for progress and error messages. */
    public static final Logger LOG = LoggerFactory.getLogger(Loops.class);
    /**
     * Name of the directory, relative to the web graph database, that the
     * finalizer job writes its output to.
     */
    public static final String LOOPS_DIR = "loops";
    /**
     * Name of the directory, relative to the web graph database, that holds the
     * current routes. It is also used as the prefix of the temporary routes
     * directory.
     */
    public static final String ROUTES_DIR = "routes";

    /**
     * A candidate path through the link graph. A route carries the url it is
     * trying to get back to, the outlink url it was created for, and a flag
     * telling whether the url being looked for has been reached, i.e. whether
     * the route is a cycle.
     */
    public static class Route implements Writable {

        private boolean found;
        private String lookingFor;
        private String outlinkUrl;

        /**
         * Creates a route with no urls set that is not marked as found.
         */
        public Route() {
        }

        /**
         * @return true if this route has been marked as a cycle.
         */
        public boolean isFound() {
            return this.found;
        }

        /**
         * @param found whether this route has been identified as a cycle.
         */
        public void setFound(boolean found) {
            this.found = found;
        }

        /**
         * @return the url this route is trying to reach.
         */
        public String getLookingFor() {
            return this.lookingFor;
        }

        /**
         * @param lookingFor the url this route is trying to reach.
         */
        public void setLookingFor(String lookingFor) {
            this.lookingFor = lookingFor;
        }

        /**
         * @return the outlink url stored on this route.
         */
        public String getOutlinkUrl() {
            return this.outlinkUrl;
        }

        /**
         * @param outlinkUrl the outlink url to store on this route.
         */
        public void setOutlinkUrl(String outlinkUrl) {
            this.outlinkUrl = outlinkUrl;
        }

        /**
         * Serializes the route as: outlink url, url being looked for, found flag.
         */
        public void write(DataOutput out) throws IOException {
            Text.writeString(out, this.outlinkUrl);
            Text.writeString(out, this.lookingFor);
            out.writeBoolean(this.found);
        }

        /**
         * Restores the route, reading the fields in the order used by
         * {@link #write(DataOutput)}.
         */
        public void readFields(DataInput in) throws IOException {
            this.outlinkUrl = Text.readString(in);
            this.lookingFor = Text.readString(in);
            this.found = in.readBoolean();
        }
    }

    /**
     * A set of loops, stored as a set of url strings.
     */
    public static class LoopSet implements Writable {

        private Set<String> loopSet = new HashSet<String>();

        /**
         * Creates an empty loop set.
         */
        public LoopSet() {

        }

        /**
         * @return the backing set of urls (not a copy).
         */
        public Set<String> getLoopSet() {
            return loopSet;
        }

        /**
         * @param loopSet the set of urls to use as the backing set.
         */
        public void setLoopSet(Set<String> loopSet) {
            this.loopSet = loopSet;
        }

        /**
         * Replaces the current content with the urls read from the input: an int
         * count followed by that many strings.
         */
        public void readFields(DataInput in) throws IOException {

            int numNodes = in.readInt();
            loopSet = new HashSet<String>();
            for (int i = 0; i < numNodes; i++) {
                String url = Text.readString(in);
                loopSet.add(url);
            }
        }

        /**
         * Writes the number of urls followed by each url. A null backing set is
         * written as an empty set.
         */
        public void write(DataOutput out) throws IOException {

            // a null set may have been installed through setLoopSet; serialize it
            // as empty instead of failing while iterating
            if (loopSet == null) {
                out.writeInt(0);
                return;
            }
            out.writeInt(loopSet.size());
            for (String loop : loopSet) {
                Text.writeString(out, loop);
            }
        }

        /**
         * Returns the urls joined by commas, without a trailing comma. An empty
         * or null set yields the empty string.
         */
        @Override
        public String toString() {
            if (loopSet == null || loopSet.isEmpty()) {
                return "";
            }
            StringBuilder builder = new StringBuilder();
            Iterator<String> loopIt = loopSet.iterator();
            builder.append(loopIt.next());
            while (loopIt.hasNext()) {
                builder.append(',').append(loopIt.next());
            }
            return builder.toString();
        }
    }

    /**
     * First stage of the Loops job: creates the initial routes, one per outlink
     * of every node that has at least one inlink.
     */
    public static class Initializer extends Configured
            implements Mapper<Text, Writable, Text, ObjectWritable>, Reducer<Text, ObjectWritable, Text, Route> {

        private JobConf conf;

        /**
         * Default constructor.
         */
        public Initializer() {
        }

        /**
         * Constructor that stores the given configuration.
         */
        public Initializer(Configuration conf) {
            setConf(conf);
        }

        /**
         * Keeps a reference to the job configuration.
         */
        public void configure(JobConf conf) {
            this.conf = conf;
        }

        /**
         * Emits every input value unchanged under its key, wrapped in an
         * ObjectWritable.
         */
        public void map(Text key, Writable value, OutputCollector<Text, ObjectWritable> output, Reporter reporter)
                throws IOException {

            ObjectWritable wrapper = new ObjectWritable();
            wrapper.set(value);
            output.collect(key, wrapper);
        }

        /**
         * Separates the values of a url into its node and its outlinks. When the
         * node exists and has inlinks, one unfound route looking for the url is
         * collected per outlink, keyed by the outlink url. Those routes are the
         * input of the Looper job.
         */
        public void reduce(Text key, Iterator<ObjectWritable> values, OutputCollector<Text, Route> output,
                Reporter reporter) throws IOException {

            String startUrl = key.toString();
            List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
            Node node = null;

            // split the values into the outlinks and the node
            while (values.hasNext()) {
                Object unwrapped = values.next().get();
                if (unwrapped instanceof LinkDatum) {
                    outlinks.add((LinkDatum) unwrapped);
                } else if (unwrapped instanceof Node) {
                    node = (Node) unwrapped;
                }
            }

            // without a node or without inlinks a cycle back to this url is not
            // possible, so no routes are created
            if (node == null) {
                return;
            }
            if (node.getNumInlinks() <= 0) {
                return;
            }

            // one fresh route per outlink
            for (LinkDatum outlink : outlinks) {
                String target = outlink.getUrl();
                Route candidate = new Route();
                candidate.setFound(false);
                candidate.setLookingFor(startUrl);
                candidate.setOutlinkUrl(target);
                output.collect(new Text(target), candidate);
            }
        }

        /**
         * Nothing to release.
         */
        public void close() {
        }
    }

    /**
     * Follows a route path looking for the start url of the route. If the start
     * url is found then the route is a cyclical path.
     */
    public static class Looper extends Configured
            implements Mapper<Text, Writable, Text, ObjectWritable>, Reducer<Text, ObjectWritable, Text, Route> {

        private JobConf conf;
        private boolean last = false;

        /**
         * Default constructor.
         */
        public Looper() {
        }

        /**
         * Configurable constructor.
         */
        public Looper(Configuration conf) {
            setConf(conf);
        }

        /**
         * Stores the job configuration and reads the boolean "last" property
         * (default false), which marks the final looper pass.
         */
        public void configure(JobConf conf) {
            this.conf = conf;
            this.last = conf.getBoolean("last", false);
        }

        /**
         * Wraps values in ObjectWritable. A LinkDatum is reduced to its url as a
         * Text; any other value is cloned before being wrapped.
         */
        public void map(Text key, Writable value, OutputCollector<Text, ObjectWritable> output, Reporter reporter)
                throws IOException {

            ObjectWritable objWrite = new ObjectWritable();
            Writable cloned = null;
            if (value instanceof LinkDatum) {
                cloned = new Text(((LinkDatum) value).getUrl());
            } else {
                cloned = WritableUtils.clone(value, conf);
            }
            objWrite.set(cloned);
            output.collect(key, objWrite);
        }

        /**
         * Performs a single loop pass looking for loop cycles within routes.
         * Routes already found are collected as they are. A route whose start
         * url is among the outlinks of the current url is marked found and
         * collected. Any other route is, unless this is the last pass, collected
         * once per outlink so that it is followed further in the next pass.
         */
        public void reduce(Text key, Iterator<ObjectWritable> values, OutputCollector<Text, Route> output,
                Reporter reporter) throws IOException {

            List<Route> routeList = new ArrayList<Route>();
            Set<String> outlinkUrls = new LinkedHashSet<String>();
            int numValues = 0;

            // aggregate all routes and outlinks for a given url
            while (values.hasNext()) {
                Object value = values.next().get();
                if (value instanceof Route) {
                    routeList.add((Route) WritableUtils.clone((Route) value, conf));
                } else if (value instanceof Text) {
                    // the set ignores duplicates on its own
                    outlinkUrls.add(((Text) value).toString());
                }

                // specify progress, could be a lot of routes
                numValues++;
                if (numValues % 100 == 0) {
                    reporter.progress();
                }
            }

            // loop through the route list in its original order
            for (int i = 0, numRoutes = routeList.size(); i < numRoutes; i++) {

                // drop the list's reference to the route for space concerns, could
                // be a lot of routes; clearing the slot is constant time whereas
                // removing the head of an ArrayList shifts every remaining element
                Route route = routeList.set(i, null);

                // if the route is already found, meaning it is a loop just collect it
                if (route.isFound()) {
                    output.collect(key, route);
                    continue;
                }

                // if the route start url is found, set route to found and collect
                if (outlinkUrls.contains(route.getLookingFor())) {
                    route.setFound(true);
                    output.collect(key, route);
                } else if (!last) {

                    // setup for next pass through the loop; urls with no outlinks
                    // that are not found will fall off
                    for (String outlink : outlinkUrls) {
                        output.collect(new Text(outlink), route);
                    }
                }
            }
        }

        /**
         * Nothing to release.
         */
        public void close() {
        }
    }

    /**
     * Last stage of the Loops job: keeps only the found routes and groups them
     * into one loop set per start url.
     */
    public static class Finalizer extends Configured
            implements Mapper<Text, Route, Text, Route>, Reducer<Text, Route, Text, LoopSet> {

        private JobConf conf;

        /**
         * Default constructor.
         */
        public Finalizer() {
        }

        /**
         * Constructor that stores the given configuration.
         */
        public Finalizer(Configuration conf) {
            setConf(conf);
        }

        /**
         * Keeps a reference to the job configuration.
         */
        public void configure(JobConf conf) {
            this.conf = conf;
        }

        /**
         * Emits found routes keyed by the url they were looking for. Routes that
         * are not found are dropped.
         */
        public void map(Text key, Route value, OutputCollector<Text, Route> output, Reporter reporter)
                throws IOException {

            if (!value.isFound()) {
                return;
            }
            Text startUrl = new Text(value.getLookingFor());
            output.collect(startUrl, value);
        }

        /**
         * Gathers the outlink urls of all routes of a start url into a loop set
         * and collects that loop set under the start url.
         */
        public void reduce(Text key, Iterator<Route> values, OutputCollector<Text, LoopSet> output,
                Reporter reporter) throws IOException {

            Set<String> cycleUrls = new HashSet<String>();
            while (values.hasNext()) {
                cycleUrls.add(values.next().getOutlinkUrl());
            }

            LoopSet result = new LoopSet();
            result.setLoopSet(cycleUrls);
            output.collect(key, result);
        }

        /**
         * Nothing to release.
         */
        public void close() {
        }
    }

    /**
     * Runs the various loop jobs: the initializer, then the looper once per
     * depth level (property "link.loops.depth", default 2), then the finalizer.
     * The initializer and every looper pass write to a temporary routes
     * directory that then replaces the routes directory. The finalizer writes
     * to the loops directory under the web graph database.
     *
     * @param webGraphDb the web graph database directory to read from and write to.
     * @throws IOException if any of the jobs or the installation of their output fails.
     */
    public void findLoops(Path webGraphDb) throws IOException {

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("Loops: starting at " + sdf.format(start));
            LOG.info("Loops: webgraphdb: " + webGraphDb);
        }

        Configuration conf = getConf();
        // resolve the file system from the path itself so that a web graph
        // database on a file system other than the default one is handled
        FileSystem fs = webGraphDb.getFileSystem(conf);
        Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
        Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
        Path routes = new Path(webGraphDb, ROUTES_DIR);
        Path tempRoute = new Path(webGraphDb,
                ROUTES_DIR + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        // run the initializer
        JobConf init = new NutchJob(conf);
        init.setJobName("Initializer: " + webGraphDb);
        FileInputFormat.addInputPath(init, outlinkDb);
        FileInputFormat.addInputPath(init, nodeDb);
        init.setInputFormat(SequenceFileInputFormat.class);
        init.setMapperClass(Initializer.class);
        init.setReducerClass(Initializer.class);
        init.setMapOutputKeyClass(Text.class);
        init.setMapOutputValueClass(ObjectWritable.class);
        init.setOutputKeyClass(Text.class);
        init.setOutputValueClass(Route.class);
        FileOutputFormat.setOutputPath(init, tempRoute);
        init.setOutputFormat(SequenceFileOutputFormat.class);
        runRouteJob(init, "initializer", fs, routes, tempRoute);

        // run the loops job for a maxdepth, default 2, which will find a 3 link
        // loop cycle
        int depth = conf.getInt("link.loops.depth", 2);
        for (int i = 0; i < depth; i++) {

            JobConf looper = new NutchJob(conf);
            looper.setJobName("Looper: " + (i + 1) + " of " + depth);
            FileInputFormat.addInputPath(looper, outlinkDb);
            FileInputFormat.addInputPath(looper, routes);
            looper.setInputFormat(SequenceFileInputFormat.class);
            looper.setMapperClass(Looper.class);
            looper.setReducerClass(Looper.class);
            looper.setMapOutputKeyClass(Text.class);
            looper.setMapOutputValueClass(ObjectWritable.class);
            looper.setOutputKeyClass(Text.class);
            looper.setOutputValueClass(Route.class);
            FileOutputFormat.setOutputPath(looper, tempRoute);
            looper.setOutputFormat(SequenceFileOutputFormat.class);
            // tells the reducer not to emit routes for a further pass
            looper.setBoolean("last", i == (depth - 1));
            runRouteJob(looper, "looper", fs, routes, tempRoute);
        }

        // run the finalizer
        JobConf finalizer = new NutchJob(conf);
        finalizer.setJobName("Finalizer: " + webGraphDb);
        FileInputFormat.addInputPath(finalizer, routes);
        finalizer.setInputFormat(SequenceFileInputFormat.class);
        finalizer.setMapperClass(Finalizer.class);
        finalizer.setReducerClass(Finalizer.class);
        finalizer.setMapOutputKeyClass(Text.class);
        finalizer.setMapOutputValueClass(Route.class);
        finalizer.setOutputKeyClass(Text.class);
        finalizer.setOutputValueClass(LoopSet.class);
        FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
        finalizer.setOutputFormat(MapFileOutputFormat.class);

        try {
            LOG.info("Loops: starting finalizer");
            JobClient.runJob(finalizer);
            LOG.info("Loops: finished finalizer");
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
        long end = System.currentTimeMillis();
        LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    /**
     * Runs a job that writes routes to the temporary directory and then
     * replaces the routes directory with that output. On failure the error is
     * logged, the temporary directory is removed and the exception is rethrown.
     */
    private void runRouteJob(JobConf job, String stage, FileSystem fs, Path routes, Path tempRoute)
            throws IOException {

        try {
            LOG.info("Loops: starting " + stage);
            JobClient.runJob(job);
            LOG.info("Loops: installing " + stage + " " + routes);
            FSUtils.replace(fs, routes, tempRoute, true);
            LOG.info("Loops: finished " + stage);
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            deleteTempRoute(fs, tempRoute);
            throw e;
        }
    }

    /**
     * Removes the temporary routes directory if it exists. A failure to do so
     * is only logged so that it cannot hide the error that triggered the cleanup.
     */
    private void deleteTempRoute(FileSystem fs, Path tempRoute) {

        try {
            if (fs.exists(tempRoute)) {
                fs.delete(tempRoute, true);
            }
        } catch (IOException cleanupFailure) {
            LOG.warn("Loops: could not remove temporary routes " + tempRoute + ": "
                    + StringUtils.stringifyException(cleanupFailure));
        }
    }

    /**
     * Command line entry point: runs the Loops tool through ToolRunner with a
     * Nutch configuration and exits with the code the tool returns.
     */
    public static void main(String[] args) throws Exception {
        Configuration nutchConf = NutchConfiguration.create();
        int exitCode = ToolRunner.run(nutchConf, new Loops(), args);
        System.exit(exitCode);
    }

    /**
     * Runs the Loops tool from command line arguments.
     *
     * @param args the arguments; "-webgraphdb" with a value is required and
     *          "-help" prints the usage.
     * @return 0 on success, -1 if the usage was printed (help requested or
     *         webgraphdb missing), -2 if parsing or the job failed.
     */
    public int run(String[] args) throws Exception {

        // OptionBuilder collects its settings statically until create is called
        OptionBuilder.withArgName("help");
        OptionBuilder.withDescription("show this help message");
        Option helpOpts = OptionBuilder.create("help");

        OptionBuilder.withArgName("webgraphdb");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("the web graph database to use");
        Option webGraphDbOpts = OptionBuilder.create("webgraphdb");

        Options options = new Options();
        options.addOption(helpOpts);
        options.addOption(webGraphDbOpts);

        CommandLineParser parser = new GnuParser();
        try {

            CommandLine line = parser.parse(options, args);
            boolean runnable = !line.hasOption("help") && line.hasOption("webgraphdb");
            if (!runnable) {
                new HelpFormatter().printHelp("Loops", options);
                return -1;
            }

            Path webGraphDb = new Path(line.getOptionValue("webgraphdb"));
            findLoops(webGraphDb);
            return 0;
        } catch (Exception e) {
            LOG.error("Loops: " + StringUtils.stringifyException(e));
            return -2;
        }
    }
}