org.commoncrawl.mapred.pipelineV3.domainmeta.blogs.feedurlid.FeedUrlIdStep.java Source code

Introduction

Here is the source code for org.commoncrawl.mapred.pipelineV3.domainmeta.blogs.feedurlid.FeedUrlIdStep.java
Source

/**
 * Copyright 2012 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.mapred.pipelineV3.domainmeta.blogs.feedurlid;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.TextBytes;

import com.google.common.collect.MinMaxPriorityQueue;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 *  @author rana
 */
public class FeedUrlIdStep extends CrawlPipelineStep implements Mapper<TextBytes, TextBytes, TextBytes, TextBytes>,
        Reducer<TextBytes, TextBytes, TextBytes, TextBytes> {

    enum Counters {
        GOT_EXCEPTION_PROCESSING_MAP, GOT_EXCEPTION_PROCESSING_REDUCE, REJECTED_HOST_TOO_MANY_KEYS, REJECTED_NESTED_BLOG_POST_FEED_ITEM, DETECTED_TOO_MANY_HOST_KEYS, REJECTED_EXCEEDED_MAX_COLLAPSED_CANDIDATES, REJECTED_TAG_FEED_ITEM, SKIPPING_BLOG_PLATFORM_URL

    }

    static class URLCandidate implements Comparable<URLCandidate> {

        String[] parts;

        GoogleURL urlObject;

        URLCandidate(GoogleURL urlObject) {
            this.urlObject = urlObject;
            String path = urlObject.getPathAndQuery();
            if (path.charAt(0) == '/') {
                this.parts = path.substring(1).split("/");
            } else {
                this.parts = path.split("/");
            }
        }

        @Override
        public int compareTo(URLCandidate o) {
            int result = parts.length < o.parts.length ? -1 : (parts.length > o.parts.length) ? 1 : 0;
            if (result == 0) {
                for (int i = 0; i < parts.length; ++i) {
                    result = parts[i].compareTo(o.parts[i]);
                    if (result != 0)
                        break;
                }
            }
            return result;
        }

        @Override
        public String toString() {
            return urlObject.getCanonicalURL();
        }
    }

    public static final String OUTPUT_DIR_NAME = "feedURLIdentifier";

    private static final Log LOG = LogFactory.getLog(FeedUrlIdStep.class);

    JsonParser parser = new JsonParser();

    static final int HAS_ATOM_XML = 1 << 0;

    static final int HAS_FEED_XML = 1 << 1;

    JsonObject objectOut = new JsonObject();

    Pattern atomRSSPattern = Pattern.compile(".*(application/atom.xml|application/rss.xml).*");
    Pattern blogPostFeedPattern = Pattern.compile(".*/[0-9]{4}/[0-9]{2}/.*");

    Pattern tagPattern = Pattern.compile(".*/tag/.*");

    Pattern blogPlatformURLPattern = Pattern.compile("http://[^/]*.(blogspot|wordpress|tumblr|typepad).com/.*");
    TreeMap<String, JsonObject> treeMap = new TreeMap<String, JsonObject>();
    static final int MAX_SAMPLES = 100;
    static final int MAX_SAME_LEVEL_SAMPLES = 4;

    private static final int SIMILARITY_THRESHOLD = 7;

    static ArrayList<URLCandidate> collapseCandidates(ArrayList<URLCandidate> candidateList,
            int similarityThreshold) {

        for (int i = 0; i < candidateList.size(); ++i) {
            URLCandidate head = candidateList.get(i);
            if (head.parts.length >= 2 && head.parts[head.parts.length - 1].equals("feed")) {
                boolean endsWithSlash = head.urlObject.getCanonicalURL().endsWith("/");
                int trailingItemIndex = head.parts.length - 1;
                int similar = 0;
                for (int j = i + 1; j < candidateList.size(); ++j) {
                    URLCandidate other = candidateList.get(j);
                    if (other.parts.length == head.parts.length) {
                        if (other.parts[trailingItemIndex].equals("feed")) {
                            if (head.parts.length == 2 || head.parts[trailingItemIndex - 2]
                                    .equals(other.parts[trailingItemIndex - 2])) {
                                similar++;
                            }
                        }
                    }
                }
                if (similar >= similarityThreshold) {
                    for (int k = 0; k < similar; ++k) {
                        candidateList.remove(i + 1);
                    }
                    candidateList.remove(i);
                    StringBuffer finalString = new StringBuffer("/");
                    for (int k = 0; k <= trailingItemIndex - 2; ++k) {
                        finalString.append(head.parts[k]);
                        finalString.append("/");
                    }
                    finalString.append("feed");
                    if (endsWithSlash)
                        finalString.append("/");

                    candidateList.add(new URLCandidate(new GoogleURL(head.urlObject.getScheme() + "://"
                            + head.urlObject.getHost() + finalString.toString())));
                }
            }
        }

        for (int i = 0; i < candidateList.size(); ++i) {
            URLCandidate head = candidateList.get(i);
            String prefix = getPrefixGivenPath(head.urlObject.getPathAndQuery());
            if (prefix.length() != 0) {
                for (int j = i + 1; j < candidateList.size();) {
                    if (candidateList.get(j).urlObject.getPathAndQuery().startsWith(prefix)) {
                        candidateList.remove(j);
                    } else {
                        ++j;
                    }
                }
            }
        }
        return candidateList;
    }

    public static ArrayList<URLCandidate> drainToArrayList(MinMaxPriorityQueue<URLCandidate> queue) {
        int queueSize = queue.size();
        ArrayList<URLCandidate> list = new ArrayList<URLCandidate>(queueSize);
        for (int i = 0; i < queueSize; ++i) {
            list.add(queue.removeFirst());
        }
        return list;
    }

    static String getPrefixGivenPath(String path) {
        int lastIndexOfSlash;
        if (path.lastIndexOf('/') == path.length() - 1) {
            lastIndexOfSlash = path.lastIndexOf('/', path.length() - 2);
        } else {
            lastIndexOfSlash = path.lastIndexOf('/');
        }

        if (lastIndexOfSlash != -1) {
            return path.substring(0, lastIndexOfSlash);
        }
        return path;
    }

    public static void main(String[] args) {
        ArrayList<URLCandidate> candidates = new ArrayList<URLCandidate>();
        candidates.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/info/feed/")));
        candidates.add(
                new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/supernatural-amphitheatre/feed/")));
        candidates.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/tickets-pre-ballot/feed/")));

        collapseCandidates(candidates, 2);

        System.out.println(candidates.toString());

        MinMaxPriorityQueue<URLCandidate> deque2 = MinMaxPriorityQueue.create();
        deque2.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/blog/feed/")));
        deque2.add(new URLCandidate(
                new GoogleURL("http://2010.goldenplains.com.au/blog/supernatural-amphitheatre/feed/")));
        deque2.add(
                new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/vlog/tickets-pre-ballot/feed/")));
        deque2.add(
                new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/blog/tickets-pre-ballot/feed/")));
        deque2.add(
                new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/vlog/tickets-pre-ballot-2/feed/")));
        deque2.add(
                new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/vlog/tickets-pre-ballot-3/feed/")));
        ArrayList<URLCandidate> test = drainToArrayList(deque2);
        System.out.println(test);

        collapseCandidates(test, 2);

        System.out.println(test.toString());
    }

    MinMaxPriorityQueue<URLCandidate> candidateList = MinMaxPriorityQueue.maximumSize(MAX_SAMPLES).create();
    ArrayList<URLCandidate> lookAhead = new ArrayList<URLCandidate>();

    public FeedUrlIdStep() {
        super(null, null, null);
    }

    public FeedUrlIdStep(CrawlPipelineTask task) {
        super(task, "Feed URL Identifier", OUTPUT_DIR_NAME);
    }

    @Override
    public void close() throws IOException {

    }

    @Override
    public void configure(JobConf job) {

    }

    @Override
    public Log getLogger() {
        return LOG;
    }

    @Override
    public void map(TextBytes key, TextBytes value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter)
            throws IOException {
        try {
            if (blogPlatformURLPattern.matcher(key.toString()).matches()) {
                reporter.incrCounter(Counters.SKIPPING_BLOG_PLATFORM_URL, 1);
            } else {
                if (atomRSSPattern.matcher(value.toString()).matches()) {
                    JsonObject containerObj = parser.parse(value.toString()).getAsJsonObject();
                    JsonObject linkStatus = containerObj.getAsJsonObject("link_status");

                    if (linkStatus != null) {
                        JsonArray typeAndRels = linkStatus.getAsJsonArray("typeAndRels");
                        if (typeAndRels != null) {
                            for (JsonElement e : typeAndRels) {
                                String parts[] = e.getAsString().split(":");
                                if (parts.length == 3 && (parts[1].equalsIgnoreCase("application/atom+xml")
                                        || parts[1].equalsIgnoreCase("application/rss+xml"))) {

                                    if (blogPostFeedPattern.matcher(key.toString()).matches()) {
                                        reporter.incrCounter(Counters.REJECTED_NESTED_BLOG_POST_FEED_ITEM, 1);
                                    } else if (tagPattern.matcher(key.toString()).matches()) {
                                        reporter.incrCounter(Counters.REJECTED_TAG_FEED_ITEM, 1);
                                    } else {
                                        GoogleURL urlObject = new GoogleURL(key.toString());
                                        if (urlObject.isValid()) {

                                            objectOut.addProperty("url", urlObject.getCanonicalURL());
                                            objectOut.addProperty("type", parts[1]);
                                            output.collect(new TextBytes(urlObject.getHost()),
                                                    new TextBytes(objectOut.toString()));

                                            break;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            reporter.incrCounter(Counters.GOT_EXCEPTION_PROCESSING_MAP, 1);
        }
    }

    @Override
    public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output,
            Reporter reporter) throws IOException {

        candidateList.clear();
        lookAhead.clear();

        try {
            while (values.hasNext()) {
                JsonObject object = parser.parse(values.next().toString()).getAsJsonObject();
                String url = object.get("url").getAsString();
                candidateList.add(new URLCandidate(new GoogleURL(url)));
            }

            if (candidateList.size() != 0) {
                if (candidateList.size() == 1) {
                    output.collect(key, new TextBytes(candidateList.peek().urlObject.getCanonicalURL()));
                } else {
                    // pop first candidate ...
                    URLCandidate firstCandidate = candidateList.removeFirst();

                    while (candidateList.peek() != null
                            && candidateList.peek().parts.length == firstCandidate.parts.length) {
                        // ok // reject outright if too many candidates of this length ...
                        lookAhead.add(candidateList.removeFirst());
                    }

                    if (lookAhead.size() > MAX_SAME_LEVEL_SAMPLES) {
                        reporter.incrCounter(Counters.DETECTED_TOO_MANY_HOST_KEYS, 1);
                        return;
                    } else {
                        output.collect(key, new TextBytes(firstCandidate.urlObject.getCanonicalURL()));
                        for (URLCandidate nextCandidate : lookAhead) {
                            output.collect(key, new TextBytes(nextCandidate.urlObject.getCanonicalURL()));
                        }
                    }
                }
            }
        } catch (Exception e) {
            reporter.incrCounter(Counters.GOT_EXCEPTION_PROCESSING_REDUCE, 1);
            LOG.error("Key:" + key + "\n" + StringUtils.stringifyException(e));
        }
    }

    @Override
    public void runStep(Path outputPathLocation) throws IOException {

        JobConf job = new JobBuilder(getDescription(), getConf())

                .inputs(((DomainMetadataTask) getTask()).getMergeDBDataPaths()).inputIsSeqFile()
                .mapper(FeedUrlIdStep.class).reducer(FeedUrlIdStep.class, false).outputIsSeqFile()
                .output(outputPathLocation).outputKeyValue(TextBytes.class, TextBytes.class)
                .numReducers(CrawlEnvironment.NUM_DB_SHARDS / 2).build();

        JobClient.runJob(job);
    }

}