org.commoncrawl.mapred.pipelineV3.crawllistgen.PartitionUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.mapred.pipelineV3.crawllistgen.PartitionUtils.java

Source

/**
 * Copyright 2012 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.mapred.pipelineV3.crawllistgen;

import java.util.Set;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import org.commoncrawl.util.ByteArrayUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.Sets;

/**
 * Helpers for building and parsing crawl-list partition keys.
 * <p>
 * A partition key is text of the form {@code domain:type:url}; domain-only
 * keys have the form {@code domain:type:} (empty URL part). The domain part is
 * everything before the first {@code ':'} and is the only portion used to pick
 * a partition.
 * <p>
 * All parsing helpers honour {@code TextBytes.getOffset()} and
 * {@code TextBytes.getLength()}: they never look at bytes outside the key's
 * valid region of the backing array.
 *
 * @author rana
 *
 */
public class PartitionUtils {

    /**
     * Hadoop partitioner that routes records by the domain part of their
     * partition key (see {@link #getPartitionGivenPartitionKey}).
     */
    public static class PartitionKeyPartitioner implements Partitioner<TextBytes, Writable> {

        /** Scratch buffer handed to every {@link #getPartition} call and overwritten each time. */
        FlexBuffer scratchBuffer = new FlexBuffer();

        /** No configuration is needed. */
        @Override
        public void configure(JobConf job) {

        }

        /**
         * @param key           partition key in {@code domain:type:url} form
         * @param value         ignored
         * @param numPartitions number of partitions
         * @return partition index derived from the key's domain part
         */
        @Override
        public int getPartition(TextBytes key, Writable value, int numPartitions) {
            return getPartitionGivenPartitionKey(key, scratchBuffer, numPartitions);
        }

    }

    /** Byte that separates the domain, type and URL parts of a partition key. */
    private static final byte SEPARATOR = (byte) ':';

    /**
     * Separator as a byte pattern. No longer used inside this class; retained
     * because it is package-visible and may be referenced elsewhere.
     */
    static byte pattern[] = { ':' };

    /**
     * Finds the first separator byte in {@code bytes[from, end)}.
     *
     * @return absolute index into {@code bytes}, or -1 if there is none
     */
    private static int indexOfSeparator(byte[] bytes, int from, int end) {
        for (int i = from; i < end; ++i) {
            if (bytes[i] == SEPARATOR) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Finds the first separator at or after absolute index {@code from},
     * looking only inside the key's valid region.
     *
     * @return absolute index of the separator in the key's backing array
     * @throws IllegalArgumentException if the key has no separator there
     */
    private static int requireSeparator(TextBytes partitionKey, int from) {
        int end = partitionKey.getOffset() + partitionKey.getLength();
        int index = indexOfSeparator(partitionKey.getBytes(), from, end);
        if (index < 0) {
            throw new IllegalArgumentException(
                    "Malformed partition key (missing ':' separator): " + partitionKey);
        }
        return index;
    }

    /**
     * Builds a domain-only partition key of the form {@code rootDomain:type:}.
     *
     * @param superDomainIdList unused by this method
     * @param rootDomain        domain to place in the key
     * @param type              record type to place in the key
     * @param partitionKeyOut   receives the generated key
     */
    public static void generatePartitionKeyGivenDomain(Set<Long> superDomainIdList, String rootDomain, int type,
            TextBytes partitionKeyOut) {
        partitionKeyOut.set(rootDomain + ":" + Integer.toString(type) + ":");
    }

    /**
     * Builds a partition key of the form {@code domain:type:canonicalURL} for
     * the given URL.
     * <p>
     * The domain part is the URL's root domain, unless the root domain's
     * fingerprint is contained in {@code superDomainIdList}, in which case the
     * full host name is used instead.
     *
     * @param superDomainIdList fingerprints of root domains whose hosts are keyed individually
     * @param urlObject         URL to generate a key for
     * @param type              record type to place in the key
     * @param partitionKeyOut   receives the generated key (untouched on failure)
     * @return {@code true} if a key was generated, {@code false} if no root
     *         domain could be extracted from the URL's host
     */
    public static boolean generatePartitionKeyGivenURL(Set<Long> superDomainIdList, GoogleURL urlObject, int type,
            TextBytes partitionKeyOut) {
        String domain = urlObject.getHost();
        String rootDomain = URLUtils.extractRootDomainName(domain);
        if (rootDomain == null) {
            return false;
        }
        long domainFP = SuperDomainList.domainFingerprintGivenName(rootDomain);
        if (!superDomainIdList.contains(domainFP)) {
            // not a super domain: all hosts under the root domain share one key prefix
            domain = rootDomain;
        }
        partitionKeyOut.set(domain + ":" + Integer.toString(type) + ":" + urlObject.getCanonicalURL());
        return true;
    }

    /**
     * Parses {@code urlKey} as a URL and, if it is valid, delegates to
     * {@link #generatePartitionKeyGivenURL(Set, GoogleURL, int, TextBytes)}.
     *
     * @return {@code true} if a key was generated, {@code false} if the URL is
     *         invalid or no root domain could be extracted
     */
    public static boolean generatePartitionKeyGivenURL(Set<Long> superDomainIdList, TextBytes urlKey, int type,
            TextBytes partitionKeyOut) {
        GoogleURL urlObject = new GoogleURL(urlKey.toString());
        if (urlObject.isValid()) {
            return generatePartitionKeyGivenURL(superDomainIdList, urlObject, type, partitionKeyOut);
        }
        return false;
    }

    /**
     * Extracts the domain part (everything before the first {@code ':'}) of a
     * partition key.
     *
     * @param partitionKey key in {@code domain:type:url} form
     * @param domainOut    receives the domain bytes
     * @return {@code domainOut}
     * @throws IllegalArgumentException if the key contains no separator
     */
    public static TextBytes getDomainGivenPartitionKey(TextBytes partitionKey, TextBytes domainOut) {
        int keyStart = partitionKey.getOffset();
        int domainEnd = requireSeparator(partitionKey, keyStart);
        domainOut.set(partitionKey.getBytes(), keyStart, domainEnd - keyStart);
        return domainOut;
    }

    /**
     * Computes the partition for a key from the hash of its domain part, so
     * all keys sharing a domain part map to the same partition.
     *
     * @param partitionKey  key in {@code domain:type:url} form
     * @param scratchBuffer overwritten with the key's domain bytes
     * @param numPartitions number of partitions
     * @return value in {@code [0, numPartitions)}
     * @throws IllegalArgumentException if the key contains no separator
     */
    public static int getPartitionGivenPartitionKey(TextBytes partitionKey, FlexBuffer scratchBuffer,
            int numPartitions) {
        int keyStart = partitionKey.getOffset();
        int domainEnd = requireSeparator(partitionKey, keyStart);
        scratchBuffer.set(partitionKey.getBytes(), keyStart, domainEnd - keyStart);
        // mask the sign bit so the modulo result is never negative
        return (scratchBuffer.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }

    /**
     * Parses the numeric type part (between the first and second {@code ':'})
     * of a partition key.
     *
     * @param partitionKey key in {@code domain:type:url} form
     * @return the type value, parsed as a base-10 number
     * @throws IllegalArgumentException if the key has fewer than two separators
     */
    public static int getTypeGivenPartitionKey(TextBytes partitionKey) {
        int domainEnd = requireSeparator(partitionKey, partitionKey.getOffset());
        int typeStart = domainEnd + 1;
        int typeEnd = requireSeparator(partitionKey, typeStart);
        return (int) ByteArrayUtils.parseLong(partitionKey.getBytes(), typeStart, typeEnd - typeStart, 10);
    }

    /**
     * Extracts the URL part (everything after the second {@code ':'}) of a
     * partition key.
     *
     * @param partitionKey key in {@code domain:type:url} form
     * @param urlOut       receives the URL bytes, or is cleared if the key has
     *                     no URL part (domain-only key)
     * @throws IllegalArgumentException if the key has fewer than two separators
     */
    public static void getURLGivenPartitionKey(TextBytes partitionKey, TextBytes urlOut) {
        int keyStart = partitionKey.getOffset();
        int keyEnd = keyStart + partitionKey.getLength();
        int domainEnd = requireSeparator(partitionKey, keyStart);
        int typeEnd = requireSeparator(partitionKey, domainEnd + 1);
        int urlStart = typeEnd + 1;

        if (urlStart < keyEnd) {
            urlOut.set(partitionKey.getBytes(), urlStart, keyEnd - urlStart);
        } else {
            urlOut.clear();
        }
    }

    /**
     * Manual smoke test: generates a URL key and a domain-only key and prints
     * each parsed component to standard output.
     */
    public static void main(String[] args) {

        TextBytes partitionKeyOut = new TextBytes();
        Set<Long> emptySet = Sets.newHashSet();
        FlexBuffer scratchBuffer = new FlexBuffer();
        TextBytes urlOut = new TextBytes();
        TextBytes domainBytes = new TextBytes();

        generatePartitionKeyGivenURL(emptySet, new TextBytes("http://www.google.com/someurl"), 0, partitionKeyOut);
        System.out.println("ParitiionKey:" + partitionKeyOut.toString());
        System.out.println("Parition:" + getPartitionGivenPartitionKey(partitionKeyOut, scratchBuffer, 10));
        System.out.println("Domain:" + getDomainGivenPartitionKey(partitionKeyOut, domainBytes));
        System.out.println("Type:" + getTypeGivenPartitionKey(partitionKeyOut));
        getURLGivenPartitionKey(partitionKeyOut, urlOut);
        System.out.println("URL:" + urlOut.toString());

        generatePartitionKeyGivenDomain(emptySet, "google.com", 0, partitionKeyOut);
        System.out.println("ParitiionKey:" + partitionKeyOut.toString());
        System.out.println("Parition:" + getPartitionGivenPartitionKey(partitionKeyOut, scratchBuffer, 10));
        System.out.println("Domain:" + getDomainGivenPartitionKey(partitionKeyOut, domainBytes));
        System.out.println("Type:" + getTypeGivenPartitionKey(partitionKeyOut));
        getURLGivenPartitionKey(partitionKeyOut, urlOut);
        System.out.println("URL:" + urlOut.toString());

    }
}