org.commoncrawl.util.URLUtils.java Source code

Introduction

Here is the source code for org.commoncrawl.util.URLUtils.java
Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.util;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.rpc.base.shared.BinaryProtocol;
import org.junit.Assert;

import com.google.common.collect.ImmutableMultimap;

/**
 * 
 * @author rana
 * 
 */
public class URLUtils {

    private static final Log LOG = LogFactory.getLog(URLUtils.class);

    /** session id normalizer **/
    private static SessionIDURLNormalizer _sessionIdNormalizer = new SessionIDURLNormalizer();

    /**
     * canonicalize url
     * 
     * @param incomingURL
     * @param stripLeadingWWW
     *          - set to true to string www. prefix from the domain if present
     * @return a canonical representation of the passed in URL that can be safely
     *         used as a replacement for the original url
     * @throws MalformedURLException
     */

    public static String canonicalizeURL(String incomingURL, boolean stripLeadingWWW) throws MalformedURLException {

        GoogleURL urlObject = new GoogleURL(incomingURL);

        if (!urlObject.isValid()) {
            throw new MalformedURLException("URL:" + incomingURL + " is invalid");
        }

        return canonicalizeURL(urlObject, stripLeadingWWW);
    }

    public static String canonicalizeURL(GoogleURL urlObject, boolean stripLeadingWWW)
            throws MalformedURLException {

        StringBuilder urlOut = new StringBuilder();

        urlOut.append(urlObject.getScheme());
        urlOut.append("://");

        if (urlObject.getUserName() != GoogleURL.emptyString) {
            urlOut.append(urlObject.getUserName());
            if (urlObject.getPassword() != GoogleURL.emptyString) {
                urlOut.append(":");
                urlOut.append(urlObject.getPassword());
            }
            urlOut.append("@");
        }

        String host = urlObject.getHost();
        if (host.endsWith(".")) {
            host = host.substring(0, host.length() - 1);
        }

        if (stripLeadingWWW) {
            if (host.startsWith("www.")) {
                // ok now. one nasty hack ... :-(
                // if root name is null or root name does not equal full host name ...
                String rootName = extractRootDomainName(host);
                if (rootName == null || !rootName.equals(host)) {
                    // striping the www. prefix
                    host = host.substring(4);
                }
            }
        }
        urlOut.append(host);

        if (urlObject.getPort() != GoogleURL.emptyString && !urlObject.getPort().equals("80")) {
            urlOut.append(":");
            urlOut.append(urlObject.getPort());
        }
        if (urlObject.getPath() != GoogleURL.emptyString) {
            int indexOfSemiColon = urlObject.getPath().indexOf(';');
            if (indexOfSemiColon != -1) {
                urlOut.append(urlObject.getPath().substring(0, indexOfSemiColon));
            } else {
                urlOut.append(urlObject.getPath());
            }
        }
        if (urlObject.getQuery() != GoogleURL.emptyString) {
            urlOut.append("?");
            urlOut.append(urlObject.getQuery());
        }

        String canonicalizedURL = urlOut.toString();

        // phase 2 - remove common session id patterns
        canonicalizedURL = _sessionIdNormalizer.normalize(canonicalizedURL, "");

        return canonicalizedURL;
    }

    /**
     * www
     * 
     * @param a
     *          url
     * 
     * @return canonical representation of the url that can be used to identify
     *         all possible occurunces of the specified url. This is
     */

    /*
     * public static String canonicalizeURL(String incomingURL) throws
     * MalformedURLException { //TODO: Make this thread safe !!! synchronized
     * (URLUtils.class) { //phase 1 String normalizedURL =
     * URLNormalizer.normalizeString(incomingURL); // phase 2 normalize host name
     * and remove www
     * 
     * // get hostname URLUtils.fastGetResult hostNameLoc =
     * URLUtils.fastGetHostFromURL(normalizedURL);
     * 
     * if (hostNameLoc != null) { String hostName =
     * normalizedURL.substring(hostNameLoc.offset,hostNameLoc.offset +
     * hostNameLoc.length); String normalizedName =
     * URLUtils.normalizeHostName(hostName);
     * 
     * String newNormalizedURL = normalizedURL.substring(0,hostNameLoc.offset) +
     * normalizedName; newNormalizedURL +=
     * normalizedURL.substring(hostNameLoc.offset+hostNameLoc.length);
     * 
     * // phase 3 - remove common session id patterns normalizedURL =
     * _sessionIdNormalizer.normalize(newNormalizedURL, "");
     * 
     * // return the url return normalizedURL; } return null; } }
     */

    /**
     * get canonical url fingerprint for the the given url
     * 
     * @param urlString
     * @return canonicalized url's fingerprint id
     */
    public static long getCanonicalURLFingerprint(String incomingURL, boolean stripLeadingWWW)
            throws MalformedURLException {
        String canonicalURL = canonicalizeURL(incomingURL, stripLeadingWWW);
        if (canonicalURL != null) {
            return URLFingerprint.generate64BitURLFPrint(canonicalURL);
        }
        return 0;
    }

    /**
     * get a full url fingerprint (domain hash and url fingeprint) for the passed
     * in url
     * 
     * @param urlString
     * @return
     */
    public static URLFP getURLFPFromURL(String urlString, boolean stripLeadingWWW) {

        try {
            // canonicalize the incoming url ...
            String canonicalURL = URLUtils.canonicalizeURL(urlString, stripLeadingWWW);

            if (canonicalURL != null) {
                return getURLFPFromCanonicalURL(canonicalURL);
            }
        } catch (MalformedURLException e) {
        }
        return null;
    }

    public static URLFP getURLFPFromURLObject(GoogleURL urlObject) {
        try {
            // canonicalize the incoming url ...
            String canonicalURL = URLUtils.canonicalizeURL(urlObject, false);

            if (canonicalURL != null) {
                return getURLFPFromCanonicalURL(canonicalURL);
            }
        } catch (MalformedURLException e) {
        }
        return null;
    }

    public static URLFP getURLFPFromCanonicalURL(String canonicalURL) {
        // get url object
        GoogleURL urlObject = new GoogleURL(canonicalURL);

        if (urlObject.isValid()) {

            // create a url fp record
            URLFP urlFP = new URLFP();

            urlFP.setUrlHash(URLFingerprint.generate64BitURLFPrint(canonicalURL));

            String hostName = urlObject.getHost();
            String rootDomainName = URLUtils.extractRootDomainName(hostName);

            if (hostName != null && rootDomainName != null) {
                urlFP.setDomainHash(URLFingerprint.generate32BitHostFP(hostName));
                urlFP.setRootDomainHash(URLFingerprint.generate32BitHostFP(rootDomainName));
                return urlFP;
            }
        }
        LOG.warn("####FAILED TO CANONCALIZER INVALID URL:" + canonicalURL);
        return null;
    }

    /**
     * get URLFPV2 for a host
     * 
     */
    public static URLFPV2 getURLFPV2FromHost(String host) {
        return getURLFPV2FromURL("http://" + host + "/");
    }

    /**
     * get new urlfp from urstring... always string leading www
     * 
     * @param urlString
     * @return
     */
    public static URLFPV2 getURLFPV2FromURL(String urlString) {

        try {
            // canonicalize the incoming url ...
            String canonicalURL = URLUtils.canonicalizeURL(urlString, false);

            if (canonicalURL != null) {
                return getURLFPV2FromCanonicalURL(canonicalURL);
            }
        } catch (MalformedURLException e) {
        }
        return null;
    }

    public static URLFPV2 getURLFPV2FromURLObject(GoogleURL urlObject) {
        try {
            // canonicalize the incoming url ...
            String canonicalURL = URLUtils.canonicalizeURL(urlObject, false);

            if (canonicalURL != null) {
                return getURLFPV2FromCanonicalURL(canonicalURL);
            }
        } catch (MalformedURLException e) {
        }
        return null;
    }

    public static URLFPV2 getURLFPV2FromCanonicalURL(String canonicalURL) {

        // create a url fp record
        URLFPV2 urlFP = new URLFPV2();

        urlFP.setUrlHash(URLFingerprint.generate64BitURLFPrint(canonicalURL));

        String hostName = fastGetHostFromURL(canonicalURL);
        String rootDomainName = null;

        if (hostName != null)
            rootDomainName = URLUtils.extractRootDomainName(hostName);

        if (hostName != null && rootDomainName != null) {
            // ok we want to strip the leading www. if necessary
            if (hostName.startsWith("www.")) {
                // ok now. one nasty hack ... :-(
                // if root name does not equal full host name ...
                if (!rootDomainName.equals(hostName)) {
                    // strip the www. prefix
                    hostName = hostName.substring(4);
                }
            }
            urlFP.setDomainHash(FPGenerator.std64.fp(hostName));
            urlFP.setRootDomainHash(FPGenerator.std64.fp(rootDomainName));
            return urlFP;
        }
        return null;
    }

    public static String fastGetHostFromURL(String urlString) {

        int hostStart = urlString.indexOf(":");
        if (hostStart != -1) {

            hostStart++;

            int urlLength = urlString.length();

            while (hostStart < urlString.length()) {
                char nextChar = urlString.charAt(hostStart);
                if (nextChar != '/' && nextChar != '\\' && nextChar != '\n' && nextChar != '\r' && nextChar != '\t'
                        && nextChar != ' ') {
                    break;
                }
                hostStart++;
            }

            if (hostStart < urlLength) {

                int hostEnd = hostStart + 1;

                while (hostEnd < urlLength) {
                    char nextChar = urlString.charAt(hostEnd);
                    if (nextChar == '/' || nextChar == '?' || nextChar == ';' || nextChar == '#')
                        break;
                    hostEnd++;
                }

                int indexOfAt = urlString.indexOf("@", hostStart);
                if (indexOfAt != -1 && indexOfAt < hostEnd) {
                    hostStart = indexOfAt + 1;
                }

                String host = urlString.substring(hostStart, hostEnd);

                int hostLength = host.length();
                int colonEnd = host.indexOf(":");
                if (colonEnd != -1) {
                    hostLength = colonEnd;
                    host = urlString.substring(hostStart, hostStart + hostLength);
                }

                GoogleURL urlObject = new GoogleURL("http://" + host);

                if (urlObject.isValid()) {
                    return urlObject.getHost();
                }
            }
            /*
             * host = host.replaceAll("((%20)|\\s)", "");
             * 
             * if (!invalidDomainCharactersRegEx.matcher(host).matches()) {
             * 
             * if (host.length() >= 1) { if (host.charAt(0) >= '0' && host.charAt(0)
             * <= '9') { if (numericOnly.matcher(host).matches()) { try { int
             * ipAddress = (int) Long.parseLong(host); return
             * IPAddressUtils.IntegerToIPAddressString(ipAddress); } catch
             * (NumberFormatException e) { return null; } } } } return host; }
             */

        }
        return null;
    }

    public static class fastGetResult {

        public fastGetResult(int offset, int length) {
            this.offset = offset;
            this.length = length;
        }

        public int offset;
        public int length;
    }

    public static fastGetResult fastGetHostFromTextURL(byte[] charStream, int offset, int length) {

        char schemeEnd[] = { ':', '/', '/' };
        char at[] = { '@' };
        char slash[] = { '/' };
        char questionMark[] = { '?' };
        char hashMark[] = { '#' };
        char colon[] = { ':' };

        int indexOfSchemeEnd = indexOf(charStream, offset, length, schemeEnd, 0, 3, 0);

        if (indexOfSchemeEnd != -1) {
            int hostStart = indexOfSchemeEnd + 3;
            int lengthRemaining = length - hostStart;
            int hostEnd = indexOf(charStream, offset + hostStart, lengthRemaining, slash, 0, 1, 0);
            if (hostEnd == -1) {
                hostEnd = indexOf(charStream, offset + hostStart, lengthRemaining, questionMark, 0, 1, 0);
            }
            if (hostEnd == -1) {
                hostEnd = indexOf(charStream, offset + hostStart, lengthRemaining, hashMark, 0, 1, 0);
            }
            if (hostEnd != -1) {
                lengthRemaining = hostEnd;
            }
            int indexOfColon = indexOf(charStream, offset + hostStart, lengthRemaining, colon, 0, 1, 0);

            if (indexOfColon != -1) {
                lengthRemaining = indexOfColon;
            }
            int indexOfAt = indexOf(charStream, offset + hostStart, lengthRemaining, at, 0, 1, 0);
            if (indexOfAt != -1) {
                lengthRemaining = lengthRemaining - (indexOfAt + 1);
                hostStart = hostStart + indexOfAt + 1;
            }

            return new fastGetResult(hostStart, lengthRemaining);
        }
        return null;
    }

    static int indexOf(byte[] source, int sourceOffset, int sourceCount, char[] target, int targetOffset,
            int targetCount, int fromIndex) {
        if (fromIndex >= sourceCount) {
            return (targetCount == 0 ? sourceCount : -1);
        }
        if (fromIndex < 0) {
            fromIndex = 0;
        }
        if (targetCount == 0) {
            return fromIndex;
        }

        char first = target[targetOffset];
        int max = sourceOffset + (sourceCount - targetCount);

        for (int i = sourceOffset + fromIndex; i <= max; i++) {
            /* Look for first character. */
            if (source[i] != first) {
                while (++i <= max && source[i] != first)
                    ;
            }

            /* Found first character, now look at the rest of v2 */
            if (i <= max) {
                int j = i + 1;
                int end = j + targetCount - 1;
                for (int k = targetOffset + 1; j < end && source[j] == target[k]; j++, k++)
                    ;

                if (j == end) {
                    /* Found whole string. */
                    return i - sourceOffset;
                }
            }
        }
        return -1;
    }

    public static String invertHostName(String hostNameIn) {
        StringBuffer hostNameOut = new StringBuffer();

        char tokens[] = hostNameIn.toCharArray();
        int lastScanStart = hostNameIn.length() - 1;
        int currentIndex = lastScanStart;

        while (currentIndex != -1) {
            if (tokens[currentIndex] == '.') {
                if (lastScanStart - currentIndex != 0) {
                    hostNameOut.append(tokens, currentIndex + 1, lastScanStart - currentIndex);
                    if (currentIndex != 0)
                        hostNameOut.append('.');
                }
                lastScanStart = currentIndex - 1;
            } else if (currentIndex == 0) {
                if (lastScanStart - currentIndex + 1 != 0) {
                    hostNameOut.append(tokens, 0, lastScanStart + 1);
                }
            }
            currentIndex--;
        }

        return hostNameOut.toString();
    }

    public static int invertHostNameFast(byte[] tokens, int offset, int length, byte[] destinationBuffer) {

        int lastScanStart = offset + length - 1;
        int currentIndex = lastScanStart;
        int destinationOffset = 0;

        while (currentIndex >= offset) {
            if (tokens[currentIndex] == '.') {
                if (lastScanStart - currentIndex != 0) {
                    System.arraycopy(tokens, currentIndex + 1, destinationBuffer, destinationOffset,
                            lastScanStart - currentIndex);
                    destinationOffset += (lastScanStart - currentIndex);
                    if (currentIndex != 0) {
                        destinationBuffer[destinationOffset++] = '.';
                    }
                }
                lastScanStart = currentIndex - 1;
            } else if (currentIndex == offset) {
                if (lastScanStart - currentIndex + 1 != 0) {
                    System.arraycopy(tokens, offset, destinationBuffer, destinationOffset,
                            (lastScanStart - currentIndex + 1));
                    destinationOffset += (lastScanStart - currentIndex + 1);
                }
            }
            currentIndex--;
        }

        return destinationOffset;
    }

    public static String normalizeHostName(String hostName, boolean stripLeadingWWW) {

        if (ipAddressRegEx.matcher(hostName).matches()) {
            return hostName;
        }

        // we are going to normalize it first , so make a copy
        String normalizedHostName = hostName.toLowerCase();
        // next check for trailing .
        while (normalizedHostName.endsWith(".")) {
            normalizedHostName = normalizedHostName.substring(0, normalizedHostName.length() - 1);
        }
        while (normalizedHostName.startsWith(".")) {
            normalizedHostName = normalizedHostName.substring(1);
        }

        normalizedHostName = normalizedHostName.replaceAll("((%20)|\\s)", "");

        if (!invalidDomainCharactersRegEx.matcher(normalizedHostName).matches()) {

            if (stripLeadingWWW) {
                String rootName = extractRootDomainName(normalizedHostName);

                if (rootName != null) {
                    String subDomain = "";

                    if (rootName.length() != normalizedHostName.length()) {
                        subDomain = normalizedHostName.substring(0,
                                normalizedHostName.length() - rootName.length());

                        if (subDomain.startsWith("www.")) {
                            normalizedHostName = normalizedHostName.substring(4);
                        }
                    }
                }
            }
            return normalizedHostName;
        }
        return null;
    }

    public static String getHostNameFromURLKey(Text key) {

        fastGetResult result = fastGetHostFromTextURL(key.getBytes(), 0, key.getLength());

        if (result != null && result.length != 0) {
            String hostName = new String(key.getBytes(), result.offset, result.length);
            return hostName;
        }
        return null;
    }

    private static void testURL(String url) {
        String hostName = getHostNameFromURLKey(new Text(url));
        try {
            URL urlObject = new URL(url);
            if (hostName == null) {
                Assert.assertTrue(urlObject.getHost().length() == 0);
            } else {
                Assert.assertTrue(urlObject.getHost().equals(hostName));
            }
        } catch (MalformedURLException e) {
            Assert.assertTrue(getHostNameFromURLKey(new Text(url)) == null);
        }
    }

    private static int findTLDNameEndLength(byte[] stream, int offset, int length) {
        boolean foundTLDStartMarker = false;
        int i = 0;
        for (i = 0; i < length; ++i) {
            if (stream[offset + i] == '!' && !foundTLDStartMarker) {
                foundTLDStartMarker = true;
            } else if (stream[offset + i] == '.' && foundTLDStartMarker) {
                break;
            }
        }
        return i;
    }

    private static ImmutableMultimap<String, String> gTLDMultiMap = null;

    private static int getNextTokenPos(String candidate, int startPos) {
        while (startPos > 0) {
            if (candidate.charAt(startPos - 1) == '.') {
                break;
            }
            --startPos;
        }
        return startPos;
    }

    public static boolean isTLDStopWord(String candidate) {
        return TLDNamesCollection.getSecondaryNames(candidate).size() != 0;
    }

    private static String buildRootNameString(String candidateString, String[] parts, int rootNameIndex) {
        int partsToInclude = parts.length - rootNameIndex;
        int dotsToInclude = partsToInclude - 1;

        // initial root name length is dot count
        int rootNameLength = dotsToInclude;
        for (int i = rootNameIndex; i < parts.length; ++i) {
            rootNameLength += parts[i].length();
        }
        return candidateString.substring(candidateString.length() - rootNameLength);
    }

    public static String extractTLDName(String candidate) {

        // special case for ip addresses
        if (ipAddressRegEx.matcher(candidate).matches()) {
            return "inaddr-arpa.arpa";
        }

        if (candidate.endsWith(".")) {
            candidate = candidate.substring(0, candidate.length() - 1);
        }
        if (candidate.startsWith("*") && candidate.length() > 1) {
            candidate = candidate.substring(1);
        }
        if (candidate.length() != 0) {
            if (!invalidDomainCharactersRegEx.matcher(candidate).find()) {
                String parts[] = candidate.split("\\.");
                if (parts.length >= 2) {
                    Collection<String> secondaryNames = TLDNamesCollection
                            .getSecondaryNames(parts[parts.length - 1]);

                    if (secondaryNames.size() != 0) {
                        // see if second to last part matches secondary names for this TLD
                        // or there is a wildcard expression for secondary name in rule set
                        if (secondaryNames.contains(parts[parts.length - 2]) || secondaryNames.contains("*")) {
                            // ok secondary part is potentianlly part of secondary name ...

                            // check to see the part in not explicitly excluded ...
                            if (secondaryNames.contains("!" + parts[parts.length - 2])) {
                                // in this case, second to last part is NOT part of secondary
                                // name
                                return buildRootNameString(candidate, parts, parts.length - 1);
                            } else {
                                // otherwise, TLD contains 2 parts
                                return buildRootNameString(candidate, parts, parts.length - 2);
                            }
                        }
                        // ok second to last part does not match set of known secondary
                        // names
                        else {
                            // make a wildcard string matching secondary name
                            String extendedWildcard = "*." + parts[parts.length - 2];
                            // if match, then this implies secondary name has two components
                            if (secondaryNames.contains(extendedWildcard)) {

                                if (parts.length >= 3) {
                                    // this implies that there must be four parts to the name to
                                    // extract root
                                    // unless exlusion rule applies
                                    String exclusionRule2 = "!" + parts[parts.length - 3] + "."
                                            + parts[parts.length - 2];

                                    // if exclusion rule is present ...
                                    if (secondaryNames.contains(exclusionRule2)) {
                                        // third part is NOT part of secondary name
                                        return buildRootNameString(candidate, parts, parts.length - 2);
                                    } else {
                                        // ok extended wildcard matched. last 3 parts are part of
                                        // the TLD
                                        if (parts.length >= 4) {
                                            return buildRootNameString(candidate, parts, parts.length - 3);
                                        }
                                    }
                                }
                            }
                            // at this point ... if the null name exists ...
                            else if (secondaryNames.contains("")) {
                                // only last item is part of TLD
                                return buildRootNameString(candidate, parts, parts.length - 1);
                            }
                        }
                    }
                }
            }
        }
        return null;
    }

    public static String extractRootDomainName(String candidate) {

        // special case for ip addresses
        if (ipAddressRegEx.matcher(candidate).matches()) {
            return candidate;
        }

        if (candidate.endsWith(".")) {
            candidate = candidate.substring(0, candidate.length() - 1);
        }
        if (candidate.startsWith("*") && candidate.length() > 1) {
            candidate = candidate.substring(1);
        }
        if (candidate.length() != 0) {
            if (!invalidDomainCharactersRegEx.matcher(candidate).find()) {
                String parts[] = candidate.split("\\.");
                if (parts.length >= 2) {
                    Collection<String> secondaryNames = TLDNamesCollection
                            .getSecondaryNames(parts[parts.length - 1]);

                    if (secondaryNames.size() != 0) {
                        // see if second to last part matches secondary names for this TLD
                        // or there is a wildcard expression for secondary name in rule set
                        if (secondaryNames.contains(parts[parts.length - 2]) || secondaryNames.contains("*")) {
                            // ok secondary part is potentianlly part of secondary name ...

                            // check to see the part in not explicitly excluded ...
                            if (secondaryNames.contains("!" + parts[parts.length - 2])) {
                                // in this case, this is an explicit override. second to last
                                // part is NOT part of secondary name
                                return buildRootNameString(candidate, parts, parts.length - 2);
                            } else {
                                // otherwise, we need at least three parts
                                if (parts.length >= 3) {
                                    return buildRootNameString(candidate, parts, parts.length - 3);
                                }
                            }
                        }
                        // ok second to last part does not match set of known secondary
                        // names
                        else {
                            // make a wildcard string matching secondary name
                            String extendedWildcard = "*." + parts[parts.length - 2];
                            // if match, then this implies secondary name has two components
                            if (secondaryNames.contains(extendedWildcard)) {

                                if (parts.length >= 3) {
                                    // this implies that there must be four parts to the name to
                                    // extract root
                                    // unless exlusion rule applies
                                    String exclusionRule2 = "!" + parts[parts.length - 3] + "."
                                            + parts[parts.length - 2];

                                    // if exclusion rule is present ...
                                    if (secondaryNames.contains(exclusionRule2)) {
                                        // third part is NOT part of secondary name
                                        return buildRootNameString(candidate, parts, parts.length - 3);
                                    } else {
                                        // ok extended wildcard matched. we need 4 parts minimum
                                        if (parts.length >= 4) {
                                            return buildRootNameString(candidate, parts, parts.length - 4);
                                        }
                                    }
                                }
                            }
                            // at this point ... if the null name exists ...
                            else if (secondaryNames.contains("")) {
                                // return second part as root name
                                return buildRootNameString(candidate, parts, parts.length - 2);
                            }
                        }
                    }
                }
            }
        }
        return null;
    }

    /** The maximum length of a Name */
    private static final int MAXNAME = 255;

    /** The maximum length of a label a Name */
    private static final int MAXLABEL = 63;

    /** The maximum number of labels in a Name */
    private static final int MAXLABELS = 128;

    static Pattern invalidDomainCharactersRegEx = Pattern.compile("[^0-9a-z\\-\\._]");
    static Pattern ipAddressRegEx = Pattern.compile("^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$");
    static Pattern numericOnly = Pattern.compile("[0-9]*$");

    public static boolean isValidDomainName(String name) {

        // check for invalid length (max 255 characters)
        if (name.length() > MAXNAME) {
            return false;
        }

        String candidate = name.toLowerCase();

        // check to see if this is an ip address
        if (ipAddressRegEx.matcher(candidate).matches()) {
            return true;
        }

        // check for invalid characters
        if (invalidDomainCharactersRegEx.matcher(candidate).matches()) {
            return false;
        }
        // split into parts
        String[] parts = name.split("\\.");

        // check for max labels constraint
        if (parts.length > MAXLABELS) {
            return false;
        }
        return extractRootDomainName(candidate) != null;
    }

    public static String invertAndMarkTLDNameStartInString(String hostName) {
        // and invert it ...
        hostName = URLUtils.invertHostName(hostName);
        // LOG.info("Inverted HostName for Key:" + key.toString() +":" + hostName);
        // create a buffer
        StringBuffer tempBuffer = new StringBuffer(hostName.length());
        // now walk it skipping tld names
        StringTokenizer tokenizer = new StringTokenizer(hostName, ".");
        boolean foundTLDName = false;
        while (tokenizer.hasMoreElements()) {
            char delimiterToUse = '.';

            String token = tokenizer.nextToken();
            if (!foundTLDName) {
                if (!URLUtils.isTLDStopWord(token)) {
                    foundTLDName = true;
                    delimiterToUse = '!';
                }
            }
            if (tempBuffer.length() != 0)
                tempBuffer.append(delimiterToUse);
            tempBuffer.append(token);
        }
        return tempBuffer.toString();
    }

    public static int findTLDNameEndLengthInMarkedString(String markedString) {
        boolean foundTLDStartMarker = false;
        int i = 0;
        for (i = 0; i < markedString.length(); ++i) {
            if (markedString.charAt(i) == '!' && !foundTLDStartMarker) {
                foundTLDStartMarker = true;
            } else if (markedString.charAt(i) == '.' && foundTLDStartMarker) {
                break;
            }
        }
        return i;
    }

    public static int findTLDNameEndLengthInMarkedStream(byte[] stream, int offset, int length) {
        boolean foundTLDStartMarker = false;
        int i = 0;
        for (i = 0; i < length; ++i) {
            if (stream[offset + i] == '!' && !foundTLDStartMarker) {
                foundTLDStartMarker = true;
            } else if (stream[offset + i] == '.' && foundTLDStartMarker) {
                break;
            }
        }
        return i;
    }

    private static void testURLNameInversion(String name) {
        System.out.println("Inverting name:" + name + " result:" + invertHostName(name));
    }

    private static void testTLDNameDetection(String name) {
        byte[] bytes = null;
        try {
            bytes = name.getBytes("UTF8");
        } catch (UnsupportedEncodingException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        int nameLen = findTLDNameEndLength(bytes, 0, bytes.length);
        System.out.println("TLD Name for:" + name + " is:" + name.substring(0, nameLen));
    }

    private static void testHostNameInvertFast(int offset, String hostName) {
        byte[] tokenArray = hostName.getBytes();
        byte[] duplicate = new byte[tokenArray.length + offset + 4];
        System.arraycopy(tokenArray, 0, duplicate, offset, tokenArray.length);
        byte[] destinationArray = new byte[tokenArray.length + 4];
        int destinationBufferSize = invertHostNameFast(duplicate, offset, tokenArray.length, destinationArray);

        System.out.println(
                "Inverted:" + hostName + " Produced:" + new String(destinationArray, 0, destinationBufferSize));
    }

    static String replicateNameNormalization(String hostNameIn) {
        // and invert it ...
        String hostName = URLUtils.invertHostName(hostNameIn);
        // LOG.info("Inverted HostName for Key:" + key.toString() +":" + hostName);
        // create a buffer
        StringBuffer tempBuffer = new StringBuffer(hostName.length());
        // now walk it skipping tld names
        StringTokenizer tokenizer = new StringTokenizer(hostName, ".");
        boolean foundTLDName = false;
        while (tokenizer.hasMoreElements()) {
            char delimiterToUse = '.';

            String token = tokenizer.nextToken();
            if (!foundTLDName) {
                if (!URLUtils.isTLDStopWord(token)) {
                    foundTLDName = true;
                    delimiterToUse = '!';
                }
            }
            tempBuffer.append(delimiterToUse);
            tempBuffer.append(token);
        }
        hostName = tempBuffer.toString();

        return hostName;
    }

    static class CanonicalizationTestCase {
        String originalURL;
        String expectedURL;

        CanonicalizationTestCase(String originalURL, String expectedURL) {
            this.originalURL = originalURL;
            this.expectedURL = expectedURL;
        }

        void validate() {
            try {
                String resultingURL = canonicalizeURL(originalURL, false);
                Assert.assertEquals(resultingURL, expectedURL);
            } catch (MalformedURLException e) {
                if (expectedURL != null) {
                    Assert.assertTrue(false);
                }
            }
        }

    }

    public static class URLFPV2RawComparator implements RawComparator<URLFPV2> {

        DataInputBuffer keyReader1 = new DataInputBuffer();
        DataInputBuffer keyReader2 = new DataInputBuffer();
        URLFPV2 fp1 = new URLFPV2();
        URLFPV2 fp2 = new URLFPV2();

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            keyReader1.reset(b1, s1, l1);
            keyReader2.reset(b2, s2, l2);

            try {
                // read first byte of both streams
                int s1FirstByte = keyReader1.read();
                int s2FirstByte = keyReader2.read();

                boolean s1IsOldFormat = false;
                boolean s2IsOldFormat = false;

                if (s1FirstByte == 0 || s1FirstByte == -1) {
                    s1IsOldFormat = true;
                }

                if (s2FirstByte == 0 || s2FirstByte == -1) {
                    s2IsOldFormat = true;
                }

                keyReader1.skip(1); // skip next byte
                fp1.setDomainHash(WritableUtils.readVLong(keyReader1));
                keyReader2.skip(1); // skip next byte
                fp2.setDomainHash(WritableUtils.readVLong(keyReader2));

                int result = ((Long) fp1.getDomainHash()).compareTo(fp2.getDomainHash());

                if (result == 0) {
                    keyReader1.skip((s1IsOldFormat) ? 2 : 1); // id field only
                    fp1.setUrlHash(WritableUtils.readVLong(keyReader1));

                    keyReader2.skip((s2IsOldFormat) ? 2 : 1); // id field only
                    fp2.setUrlHash(WritableUtils.readVLong(keyReader2));

                    result = ((Long) fp1.getUrlHash()).compareTo(fp2.getUrlHash());
                }
                return result;
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public int compare(URLFPV2 fp1, URLFPV2 fp2) {
            int result = ((Long) fp1.getDomainHash()).compareTo(fp2.getDomainHash());

            if (result == 0) {
                result = ((Long) fp1.getUrlHash()).compareTo(fp2.getUrlHash());
            }
            return result;
        }

        static void validateComparator() {
            URLFPV2 fp1 = new URLFPV2();
            URLFPV2 fp2 = new URLFPV2();
            URLFPV2 fp3 = new URLFPV2();
            URLFPV2 fp4 = new URLFPV2();

            fp1.setDomainHash(1L);
            fp2.setDomainHash(1L);
            fp3.setDomainHash(2L);
            fp4.setDomainHash(2L);

            fp1.setUrlHash(10L);
            fp2.setUrlHash(9L);
            fp3.setUrlHash(18L);
            fp4.setUrlHash(20L);

            DataOutputBuffer buffer1 = new DataOutputBuffer();
            DataOutputBuffer buffer2 = new DataOutputBuffer();
            DataOutputBuffer buffer3 = new DataOutputBuffer();
            DataOutputBuffer buffer4 = new DataOutputBuffer();

            URLFPV2RawComparator comparator = new URLFPV2RawComparator();

            Assert.assertTrue(comparator.compare(fp1, fp2) == 1);
            Assert.assertTrue(comparator.compare(fp2, fp1) == -1);
            Assert.assertTrue(comparator.compare(fp1, fp3) == -1);
            Assert.assertTrue(comparator.compare(fp3, fp1) == 1);
            Assert.assertTrue(comparator.compare(fp4, fp3) == 1);
            Assert.assertTrue(comparator.compare(fp3, fp4) == -1);

            try {
                BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_SHORT;
                fp1.write(buffer1);
                BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_VINT;
                fp2.write(buffer2);
                BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_SHORT;
                fp3.write(buffer3);
                BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_VINT;
                fp4.write(buffer4);

            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }

            Assert.assertTrue(comparator.compare(buffer1.getData(), 0, buffer1.getLength(), buffer2.getData(), 0,
                    buffer2.getLength()) == 1);
            Assert.assertTrue(comparator.compare(buffer2.getData(), 0, buffer2.getLength(), buffer1.getData(), 0,
                    buffer1.getLength()) == -1);
            Assert.assertTrue(comparator.compare(buffer1.getData(), 0, buffer1.getLength(), buffer3.getData(), 0,
                    buffer3.getLength()) == -1);
            Assert.assertTrue(comparator.compare(buffer3.getData(), 0, buffer3.getLength(), buffer1.getData(), 0,
                    buffer1.getLength()) == 1);
            Assert.assertTrue(comparator.compare(buffer3.getData(), 0, buffer3.getLength(), buffer4.getData(), 0,
                    buffer4.getLength()) == -1);
            Assert.assertTrue(comparator.compare(buffer4.getData(), 0, buffer4.getLength(), buffer3.getData(), 0,
                    buffer3.getLength()) == 1);

        }

    }

    public static CanonicalizationTestCase[] testCases = {
            new CanonicalizationTestCase("http://foo.bar.com.#?", "http://foo.bar.com/"),
            new CanonicalizationTestCase("http://foo.bar.com./;msg1234FDF FDFDFDF FDFD?param1=test",
                    "http://foo.bar.com/?param1=test"),
            new CanonicalizationTestCase("http://foo.bar.com./;msg1234FDF FDFDFDF FDFD", "http://foo.bar.com/"),
            new CanonicalizationTestCase("http://foo.bar.com/subpath/;msg1234FDF FDFDFDF FDFD",
                    "http://foo.bar.com/subpath/"),
            new CanonicalizationTestCase("http://foo.bar.com/subpath/;msg1234FDF FDFDFDF FDFD?param=1",
                    "http://foo.bar.com/subpath/?param=1"),
            new CanonicalizationTestCase("http://foo.bar.com.#REF=24242", "http://foo.bar.com/"),
            new CanonicalizationTestCase(
                    "http://www.lakeshorelearning.com/order/onlineOrder.jsp;jsessionid=KxMMpRGgPpC1ktZ1pJJCZF1MmmFxZHPnyrNJhBmWJGHkhcL5Hd4p!-617247554!NONE?FOLDER%3C%3Efolder_id=2534374302096766&ASSORTMENT%3C%3East_id=1408474395181113&bmUID=1257311436941",
                    "http://www.lakeshorelearning.com/order/onlineOrder.jsp?FOLDER%3C%3Efolder_id=2534374302096766&ASSORTMENT%3C%3East_id=1408474395181113&bmUID=1257311436941"),
            new CanonicalizationTestCase(
                    "http://www.emeraldinsight.com/Insight/menuNavigation.do;jsessionid=A17FC93E864C2F8B3709F63558BA69DB?hdAction=InsightHome",
                    "http://www.emeraldinsight.com/Insight/menuNavigation.do?hdAction=InsightHome")

    };

    public static void validatateCanonicalization() {
        for (CanonicalizationTestCase testCase : testCases) {
            testCase.validate();
        }
    }

    public static void main(String[] args) {

        URLFPV2RawComparator.validateComparator();

        URLFPV2 fingerprint = getURLFPV2FromURL("http://www.gmail.fr/");
        URLFPV2 fingerprint2 = getURLFPV2FromURL("http://gmail.fr/");

        Assert.assertTrue(fingerprint.getDomainHash() == fingerprint2.getDomainHash());

        testRootDomainExtractor();
        Assert.assertTrue(isValidDomainName("192.168.0.1"));
        Assert.assertFalse(isValidDomainName("192.168.0.1.1"));
        Assert.assertTrue(URLUtils.normalizeHostName("192.168.0.1", false).equals("192.168.0.1"));

        validatateCanonicalization();
    }

    private static void testRootDomainExtractor() {

        System.out.println(extractRootDomainName("www.ret.gov.au") + "," + extractTLDName("www.ret.gov.au"));
        System.out.println(extractRootDomainName("www.jobshop.ro") + "," + extractTLDName("www.jobshop.ro"));
        System.out.println(extractRootDomainName("www.ne.jp") + "," + extractTLDName("www.ne.jp"));
        System.out.println(extractRootDomainName("foo.ac.jp") + "," + extractTLDName("foo.ac.jp"));
        System.out.println(extractRootDomainName("aichi.jp") + "," + extractTLDName("aichi.jp"));
        System.out.println(extractRootDomainName("bochi.aichi.jp") + "," + extractTLDName("bochi.aichi.jp"));
        System.out.println(
                extractRootDomainName("more.bochi.aichi.jp") + "," + extractTLDName("more.bochi.aichi.jp"));
        System.out.println(extractRootDomainName("metro.tokyo.jp") + "," + extractTLDName("metro.tokyo.jp"));
        System.out.println(
                extractRootDomainName("fluff.metro.tokyo.jp") + "," + extractTLDName("fluff.metro.tokyo.jp"));
        System.out.println(
                extractRootDomainName("www.pref.hokkaido.jp") + "," + extractTLDName("www.pref.hokkaido.jp"));
        System.out.println(extractRootDomainName("www.subdomain.pref2.hokkaido.jp") + ","
                + extractTLDName("www.subdomain.pref2.hokkaido.jp"));
        System.out.println(extractRootDomainName("gigaom.com") + "," + extractTLDName("gigaom.com"));
        System.out.println(extractRootDomainName("www.gigaom.com.cn") + "," + extractTLDName("www.gigaom.com.cn"));
        System.out.println(extractRootDomainName("www.foobar.idf.il") + "," + extractTLDName("www.foobar.idf.il"));
        System.out.println(extractRootDomainName("192.168.0.1") + "," + extractTLDName("192.168.0.1"));

        Assert.assertTrue(extractRootDomainName(".gigaom.com").equals("gigaom.com"));
        Assert.assertTrue(extractRootDomainName("*.gigaom.com").equals("gigaom.com"));
        Assert.assertTrue(extractRootDomainName("www.gigaom.com").equals("gigaom.com"));
        Assert.assertTrue(extractRootDomainName("foobar.foo.cn").equals("foo.cn"));
        Assert.assertTrue(extractRootDomainName("foobar.google.com.cn").equals("google.com.cn"));
        Assert.assertTrue(extractRootDomainName("google.com.cn").equals("google.com.cn"));
        Assert.assertTrue(extractRootDomainName("cn") == null);
        Assert.assertTrue(extractRootDomainName("ab.ca") == null);
        Assert.assertTrue(extractRootDomainName("somedomain.ab.ca").equals("somedomain.ab.ca"));
        Assert.assertTrue(extractRootDomainName("www.somedomain.ab.ca").equals("somedomain.ab.ca"));
        Assert.assertTrue(extractRootDomainName("www.somedomain .ab.ca") == null);

    }

    private static void utilsTest() throws Exception {

        /*
         * testHostNameInvertFast(4,"www.google.com");
         * testHostNameInvertFast(4,"google.com.");
         * 
         * System.out.println(invertHostName("news.bbc.co.uk."));
         * System.out.println(invertHostName("www.zubia-alam.blogspot.com")+".");
         * System.out.println(invertHostName("zubia-alam.blogspot.com")+".");
         * System.out.println("compareTo returned:" +
         * (invertHostName("zubia-alam.blogspot.com"
         * )+".").compareTo((invertHostName("zubia-alam.blogspot.com"))));
         * System.out.println("compareTo returned:" +
         * "x-factor-e.".compareTo("x-factor."));
         * 
         * testHostNameInvertFast(4,"www.google.com");
         * testHostNameInvertFast(4,"google.com.");
         * testHostNameInvertFast(4,".google.com");
         * testHostNameInvertFast(4,"google.com.");
         * 
         * testHostNameInvertFast(4,"www.google.com");
         * testHostNameInvertFast(4,"google.com.");
         * testHostNameInvertFast(4,".google.com");
         * testHostNameInvertFast(4,"google.com.");
         */

        Assert.assertTrue(normalizeHostName(".gigaom. com", true).equals("gigaom.com"));
        Assert.assertTrue(normalizeHostName("%20gigaom.com", true).equals("gigaom.com"));
        Assert.assertTrue(normalizeHostName("www.gigaom.com", true).equals("gigaom.com"));
        Assert.assertTrue(normalizeHostName("www.gigaom.com.", true).equals("gigaom.com"));
        Assert.assertTrue(normalizeHostName("www.gigaom.com", false).equals("www.gigaom.com"));
        Assert.assertTrue(normalizeHostName("www.gigaom.com.", false).equals("www.gigaom.com"));
        Assert.assertTrue(normalizeHostName(".com.", true).equals("com"));
        Assert.assertTrue(normalizeHostName("..gigaom.com..", true).equals("gigaom.com"));

        Assert.assertTrue(normalizeHostName("aisa.org.af.", true).equals("aisa.org.af"));

        /*
         * testTLDNameDetection("com!google.www");
         * testTLDNameDetection("au.com!google.www");
         * testTLDNameDetection("com.google.www");
         * 
         * testURLNameInversion("www.google.com");
         * testURLNameInversion("google.com."); testURLNameInversion(".google.com");
         * testURLNameInversion("google.com.");
         * 
         * testURLNameInversion(invertHostName("www.google.com"));
         * testURLNameInversion(invertHostName("google.com."));
         * testURLNameInversion(invertHostName(".google.com"));
         * testURLNameInversion(invertHostName("google.com."));
         * 
         * testURL("http://www.google.com/"); testURL("http://google.com:8080/");
         * testURL("http://google.com:8080"); testURL("http://google.com");
         * testURL("http://ahad@google.com");
         * testURL("http://ahad:password@google.com");
         * testURL("http://ahad:password@google.com/"); testURL("http:///");
         */
    }

}