com.francetelecom.clara.cloud.commons.InternetDomainNameCleaner.java Source code

Introduction

Here is the source code for com.francetelecom.clara.cloud.commons.InternetDomainNameCleaner.java
Source

/**
 * Copyright (C) 2015 Orange
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.francetelecom.clara.cloud.commons;

import com.google.common.base.*;
import com.google.common.collect.Lists;
import org.apache.commons.lang3.StringUtils;

import java.util.List;

import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Cleans up a candidate domain name into a normalized one. Inspired from guava InternetDomainName.
 *
 * <p>Unlike a validator, this class never rejects its input: over-long names and
 * labels are truncated and characters that are not allowed in a label are
 * removed. Instances are immutable once constructed.
 *
 * <p>Note that a label made only of removable characters becomes an empty
 * string and is kept as such, so the cleaned name may still contain empty
 * labels (for example consecutive dots).
 */
public final class InternetDomainNameCleaner {

    private static final CharMatcher DOTS_MATCHER = CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
    private static final Splitter DOT_SPLITTER = Splitter.on('.');
    private static final Joiner DOT_JOINER = Joiner.on('.');

    /** Separator-like characters that may appear inside a label but not at either end. */
    private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");

    /** Letters, digits, dashes and underscores. */
    private static final CharMatcher PART_CHAR_MATCHER = CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);

    /**
     * Characters kept in a label: every non-ASCII character (assumed valid, see
     * {@link #getFixedPart(String, boolean)}) plus the ASCII characters accepted
     * by {@link #PART_CHAR_MATCHER}. Must be declared after the matchers it uses.
     */
    private static final CharMatcher RETAINED_CHAR_MATCHER = CharMatcher.ASCII.negate().or(PART_CHAR_MATCHER);

    /** Characters that the final label must not start with. */
    private static final CharMatcher FINAL_PART_FORBIDDEN_START_MATCHER = CharMatcher.DIGIT.or(DASH_MATCHER);

    /**
     * Maximum parts (labels) in a domain name. This value arises from
     * the 255-octet limit described in
     * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
     * the fact that the encoding of each part occupies at least two bytes
     * (dot plus label externally, length byte plus label internally). Thus, if
     * all labels have the minimum size of one byte, 127 of them will fit.
     */
    private static final int MAX_PARTS = 127;

    /**
     * Maximum length of a full domain name, including separators, and
     * leaving room for the root label. See
     * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
     */
    private static final int MAX_LENGTH = 253;

    /**
     * Maximum size of a single part of a domain name. See
     * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
     */
    private static final int MAX_DOMAIN_PART_LENGTH = 63;

    /**
     * The full cleaned domain name, converted to lower case.
     */
    private final String name;

    /**
     * The cleaned parts of the domain name, converted to lower case.
     */
    private final List<String> parts;

    /**
     * Constructor used to implement {@link #from(String)}.
     *
     * @param name the candidate domain name; must not be null
     */
    InternetDomainNameCleaner(String name) {
        // Normalize:
        // * ASCII characters to lowercase
        // * All dot-like characters to '.'
        // * Strip trailing '.'

        name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));

        if (name.endsWith(".")) {
            name = name.substring(0, name.length() - 1);
        }

        // Keep the rightmost MAX_LENGTH characters: the rightmost labels are the
        // most significant ones, and surplus labels are dropped from the left
        // below as well.
        if (name.length() > MAX_LENGTH) {
            name = name.substring(name.length() - MAX_LENGTH);
        }

        final List<String> splitParts = Lists.newArrayList(DOT_SPLITTER.split(name));
        if (splitParts.size() > MAX_PARTS) {
            // Drop the leftmost labels so that exactly MAX_PARTS remain.
            splitParts.subList(0, splitParts.size() - MAX_PARTS).clear();
        }
        fixParts(splitParts);

        this.parts = splitParts;
        this.name = DOT_JOINER.join(splitParts);
    }

    /**
     * Returns an {@link InternetDomainNameCleaner} holding the cleaned-up form
     * of the given candidate domain name. The input is never rejected; it is
     * normalized instead. Rules from <a
     * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
     * ("Internationalizing Domain Names in Applications") are not applied, and
     * the rules of <a
     * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> are relaxed in
     * the following ways:
     * <ul>
     * <li>Non-ASCII characters are kept as they are.
     * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
     * <li>Parts other than the final part may start with a digit.
     * </ul>
     *
     * @param domain A candidate domain name (not IP address); must not be null
     * @return a cleaner instance whose {@link #name()} is the cleaned domain name
     * @throws NullPointerException if {@code domain} is null
     */
    public static InternetDomainNameCleaner from(String domain) {
        return new InternetDomainNameCleaner(checkNotNull(domain));
    }

    /**
     * Cleans every part of a domain name in place, replacing each element of
     * the list with the result of {@link #getFixedPart(String, boolean)}. The
     * last element is treated as the final (rightmost) part, which has stricter
     * rules. An empty list is left untouched.
     *
     * @param parts the parts (labels) of the domain name, leftmost first; the
     *     list must support {@link List#set(int, Object)}
     */
    public static void fixParts(List<String> parts) {
        if (parts.isEmpty()) {
            return;
        }

        final int lastIndex = parts.size() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            parts.set(i, getFixedPart(parts.get(i), i == lastIndex));
        }
    }

    /**
     * Helper method for {@link #fixParts(List)}. Cleans one part (label) of a
     * domain name:
     * <ol>
     * <li>the part is truncated to the maximum label length;
     * <li>ASCII characters other than letters, digits, '-' and '_' are removed;
     * <li>all leading and trailing '-' and '_' are removed;
     * <li>for the final part only, all leading digits (and any '-' or '_' they
     *     expose) are removed.
     * </ol>
     * The result may be an empty string when nothing valid remains.
     *
     * @param part The domain name part to be cleaned; must not be null
     * @param isFinalPart Is this the final (rightmost) domain part?
     * @return the cleaned part, possibly empty
     */
    public static String getFixedPart(String part, boolean isFinalPart) {

        if (part.length() > MAX_DOMAIN_PART_LENGTH) {
            part = StringUtils.left(part, MAX_DOMAIN_PART_LENGTH);
        }

        /*
         * GWT claims to support java.lang.Character's char-classification methods,
         * but it actually only works for ASCII. So for now, assume any non-ASCII
         * characters are valid. The only place this seems to be documented is here:
         * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
         *
         * ASCII characters in the part are expected to be valid per RFC 1035,
         * with underscore also being allowed due to widespread practice. Only the
         * offending ASCII characters are dropped; non-ASCII ones are preserved.
         */

        part = RETAINED_CHAR_MATCHER.retainFrom(part);

        // No initial or final dashes or underscores, however many there are.

        part = DASH_MATCHER.trimFrom(part);

        /*
         * Note that we allow (in contravention of a strict interpretation of the
         * relevant RFCs) domain parts other than the last may begin with a digit
         * (for example, "3com.com"). It's important to disallow an initial digit in
         * the last part; it's the only thing that stops an IPv4 numeric address
         * like 127.0.0.1 from looking like a valid domain name. Every leading
         * digit is removed, together with any dash or underscore that removal
         * would leave at the start of the part.
         */

        if (isFinalPart) {
            part = FINAL_PART_FORBIDDEN_START_MATCHER.trimLeadingFrom(part);
        }

        return part;
    }

    /**
     * Returns the cleaned domain name, normalized to all lower case.
     */
    public String name() {
        return name;
    }

    // TODO: specify this to return the same as name(); remove name()
    @Override
    public String toString() {
        return Objects.toStringHelper(this).add("name", name).toString();
    }

    /**
     * Equality testing is based on the text supplied by the caller,
     * after the cleaning performed at construction time. For
     * example, a non-ASCII Unicode domain name and the Punycode version
     * of the same domain name would not be considered equal.
     *
     */
    @Override
    public boolean equals(Object object) {
        if (object == this) {
            return true;
        }

        if (object instanceof InternetDomainNameCleaner) {
            InternetDomainNameCleaner that = (InternetDomainNameCleaner) object;
            return this.name.equals(that.name);
        }

        return false;
    }

    @Override
    public int hashCode() {
        return name.hashCode();
    }
}