pl.nask.hsn2.normalizers.UrlNormalizer.java Source code

Java tutorial

Introduction

Here is the source code for pl.nask.hsn2.normalizers.UrlNormalizer.java

Source

/*
 * Copyright (c) NASK, NCSC
 * 
 * This file is part of HoneySpider Network 2.0.
 * 
 * This is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
    
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
    
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package pl.nask.hsn2.normalizers;

import org.apache.commons.httpclient.URIException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import pl.nask.hsn2.normalizers.URLNormalizerUtils.EncodingType;

public class UrlNormalizer {
    /** Class-wide SLF4J logger. */
    private static final Logger LOG = LoggerFactory.getLogger(UrlNormalizer.class);
    /** Scheme assigned to inputs that are parsed without an explicit scheme. */
    public static final String DEFAULT_SCHEMA = "http";

    /** The constructor argument after {@code trim()}; never modified afterwards. */
    private final String original;
    /**
     * Working buffer holding the part of the input that has not been parsed yet;
     * the parsing methods delete consumed components from its front.
     * Package-private: access for tests only.
     */
    StringBuilder toProcess;
    /** Parse result; stays {@code null} until {@code processSchemeOrHost()} creates it. */
    private URI internalURI;

    /**
     * Exposes the internal parse result to same-package callers.
     *
     * @return the internal {@link URI} holder, or {@code null} if parsing has not started yet
     */
    URI getInternalURI() {
        return internalURI;
    }

    /**
     * Prepares {@code uri} for normalization.
     * <p>
     * The input is trimmed, cut at the first embedded space (with a warning),
     * stripped of ISO control characters and finally passed through
     * {@code URLNormalizerUtils.removeObfuscatedEncoding} with the
     * {@code URI_BASE} encoding type. No parsing happens here; call
     * {@link #normalize()} afterwards.
     *
     * @param uri raw URL/URI text; must not be {@code null}
     */
    public UrlNormalizer(String uri) {
        this.original = uri.trim();
        this.toProcess = new StringBuilder(this.original);

        // After trim() a space can never sit at index 0, so "> 0" means "a space exists".
        final int firstSpace = toProcess.indexOf(" ");
        if (firstSpace > 0) {
            LOG.warn("URL contains unescaped space(s) will be trimmed at position [{}]:({})", firstSpace + 1,
                    toProcess.toString());
            toProcess.setLength(firstSpace);
        }

        // Walk backwards so deletions never shift the characters still to be inspected.
        for (int idx = toProcess.length() - 1; idx >= 0; idx--) {
            if (Character.isISOControl(toProcess.codePointAt(idx))) {
                toProcess.deleteCharAt(idx);
            }
        }

        URLNormalizerUtils.removeObfuscatedEncoding(toProcess, new EncodingType[] { EncodingType.URI_BASE });
    }

    /**
     * Tells whether the input was recognised as a URL.
     *
     * @return the {@code isUrl} flag of the parse result, or {@code false} if nothing has been parsed yet
     */
    public boolean isURL() {
        return internalURI != null && internalURI.isUrl;
    }

    /**
     * Tells whether parsing has run to completion.
     *
     * @return the {@code processed} flag of the parse result, or {@code false} if nothing has been parsed yet
     */
    public boolean isNormalized() {
        return internalURI != null && internalURI.processed;
    }

    /**
     * Parses and normalizes the input given to the constructor.
     * <p>
     * Runs scheme/host detection first and then consumes the rest of the input.
     * The call is idempotent: once normalization has completed, further calls
     * return immediately instead of re-parsing a working buffer that the first
     * run may already have consumed.
     *
     * @throws URLMalformedInputException if the input is empty or structurally invalid
     * @throws URLHostParseException if the host part cannot be interpreted
     * @throws URLParseException declared by the parsing helpers
     * @throws URIException declared by the parsing helpers
     */
    public void normalize()
            throws URLMalformedInputException, URLHostParseException, URLParseException, URIException {
        if (isNormalized()) {
            // Parsing deletes consumed components from toProcess; running it again on the
            // leftovers would either throw or overwrite a valid result.
            return;
        }
        processSchemeOrHost();
        processURL();
    }

    /**
     * Returns the normalized form of the input.
     *
     * @return the normalized URI string, or {@code null} when {@link #normalize()} has not
     *         been called yet or has not completed
     */
    public String getNormalized() {
        // Guard like getPort()/getHost() do: before normalize() there is no parse result.
        if (internalURI == null) {
            return null;
        }
        return internalURI.getURIasString();
    }

    /**
     * Returns the input exactly as stored by the constructor.
     *
     * @return the constructor argument after {@code trim()}
     */
    public String getOriginalURL() {
        return this.original;
    }

    /**
     * Returns the path of a parsed URL, or the hierarchical part when only that was stored.
     *
     * @return the parsed path if present, otherwise the stored hierarchical part;
     *         an empty string when neither is available (e.g. before {@link #normalize()})
     */
    public String getPath() {
        if (internalURI == null) {
            return "";
        }
        if (internalURI.path != null) {
            return internalURI.path;
        }
        // hierPart is only populated by the scheme-aware URI constructor; it may be absent.
        return internalURI.hierPart == null ? "" : internalURI.hierPart.toString();
    }

    /**
     * Returns the port stored in the parse result.
     *
     * @return the stored port, or {@code -1} if nothing has been parsed yet
     *         ({@code -1} is also the holder's initial port value)
     */
    public int getPort() {
        if (internalURI == null) {
            return -1;
        }
        return internalURI.port;
    }

    /**
     * Returns the scheme of the parsed input.
     *
     * @return the stored scheme, or {@code null} when no scheme is known yet
     *         (in particular before {@link #normalize()} has been called)
     */
    public String getProtocol() {
        // Guard like getPort()/getHost() do instead of failing with a NullPointerException.
        if (internalURI == null) {
            return null;
        }
        return internalURI.scheme;
    }

    /**
     * Returns the query without its first character.
     * <p>
     * The stored query is appended directly after the path when the URI is
     * rebuilt, so its first character is presumably the {@code '?'} delimiter.
     *
     * @return the stored query minus its first character, or {@code null} when there is
     *         no parse result or no query
     */
    public String getQuery() {
        final String rawQuery = internalURI == null ? null : internalURI.query;
        return rawQuery == null ? null : rawQuery.substring(1);
    }

    /**
     * Returns the user-info component of the parsed input.
     *
     * @return the stored user info, or an empty string when there is none
     *         (including before {@link #normalize()} has been called)
     */
    public String getUserInfo() {
        if (internalURI == null || internalURI.userInfo == null) {
            return "";
        }
        return internalURI.userInfo;
    }

    /**
     * Tells whether the normalized URL has a host that is a name rather than an IP literal.
     *
     * @return {@code true} only when parsing has completed, the input is a URL, a host was
     *         found and that host matches none of the IPv4/IPv6 patterns of
     *         {@code URLNormalizerUtils}; {@code false} otherwise
     */
    public boolean hasHostName() {
        if (internalURI == null || !internalURI.processed || !internalURI.isUrl) {
            return false;
        }
        final String host = internalURI.host;
        // Parsing can finish without a host (the query and fragment branches set
        // "processed" on their own); Pattern.matcher(null) would throw.
        if (host == null) {
            return false;
        }
        final boolean isIpLiteral = URLNormalizerUtils.ipv4.matcher(host).matches()
                || URLNormalizerUtils.ipv6.matcher(host).matches()
                || URLNormalizerUtils.ipv6v1.matcher(host).matches()
                || URLNormalizerUtils.ipv6v4normalized.matcher(host).matches();
        return !isIpLiteral;
    }

    /**
     * Returns the fragment of a parsed URL.
     *
     * @return the stored fragment, or an empty string when the input is not a URL
     *         or no fragment was stored
     */
    public String getFragment() {
        if (isURL() && internalURI.fragment != null) {
            return internalURI.fragment;
        }
        return "";
    }

    /**
     * Returns the host of the parsed input.
     *
     * @return the stored host, or an empty string when there is no parse result or no host
     */
    public String getHost() {
        if (internalURI != null && internalURI.host != null) {
            return internalURI.host;
        }
        return "";
    }

    /**
     * Returns the last dot-separated label of the host name.
     *
     * @return the text after the last {@code '.'} of the host, the whole host when it
     *         contains no dot, or an empty string when {@link #hasHostName()} is {@code false}
     */
    public String getTLD() {
        if (hasHostName()) {
            final String hostName = internalURI.host;
            // lastIndexOf yields -1 when there is no dot, so substring(0) returns the whole host.
            return hostName.substring(hostName.lastIndexOf('.') + 1);
        }
        return "";
    }

    /**
     * Returns the host name cut down to the part that follows its second-to-last dot.
     * <p>
     * Dot positions come from {@code URLNormalizerUtils.findLastMatch}; presumably it
     * returns the index of the last match inside the given range and a negative value
     * when there is none - verify against the helper.
     *
     * @return the host from just after the second-to-last dot, or an empty string when
     *         {@link #hasHostName()} is {@code false} or the first lookup yields a
     *         position {@code <= 0}
     */
    public String getSLD() {
        if (!hasHostName()) {
            return "";
        }
        final String hostName = internalURI.host;
        final int lastDot = URLNormalizerUtils.findLastMatch(hostName, ".", 0, hostName.length());
        if (lastDot <= 0) {
            return "";
        }
        final int previousDot = URLNormalizerUtils.findLastMatch(hostName, ".", 0, lastDot);
        return hostName.substring(previousDot + 1);
    }

    /**
     * Consumes the remaining input in {@code toProcess} one component at a time.
     * <p>
     * Each invocation locates the first of the delimiters {@code . [ / ? # : @},
     * handles the component that delimiter introduces (user info, host, port, path,
     * query or fragment), removes the consumed text from the front of
     * {@code toProcess} and then calls itself again. The recursion stops as soon as
     * {@code internalURI.processed} is set, which the branches do once the buffer
     * is empty (the fragment branch always does, since a fragment runs to the end).
     * <p>
     * The parsing helpers of {@code URLNormalizerUtils} are opaque here; the code
     * deletes by the length of the value they return, which presumably means they
     * normalize the buffer in place - confirm before changing any deletion.
     *
     * @throws URLMalformedInputException if the buffer is empty, the structure is invalid
     *         or the port is not a valid integer
     * @throws URLHostParseException if a host cannot be interpreted
     * @throws URLParseException declared by the parsing helpers
     * @throws URIException declared by the parsing helpers
     */
    private void processURL()
            throws URLMalformedInputException, URLHostParseException, URLParseException, URIException {
        if (internalURI.processed) {
            return;
        }
        LOG.debug("Processing input: '{}'", toProcess.toString());
        if (toProcess.length() == 0) {
            LOG.warn("There is nothing to parse!");
            throw new URLMalformedInputException("Cannot process URL:" + original);

        }
        int i = URLNormalizerUtils.findFirstMatch(toProcess, ".[/?#:@", 0);
        if (i < 0) {
            // No delimiter left: the remainder is either garbage after a host or a bare host.
            if (internalURI.host != null) {
                throw new URLMalformedInputException("Cannot parse path,query,fragment:" + toProcess.toString());
            } else {
                String h = null;
                try {
                    h = URLNormalizerUtils.decodeIPv4(toProcess, 0, toProcess.length());
                } catch (URLHostParseException e) {
                    LOG.debug("Not an IPv4:'{}', trying DNS.", toProcess.toString());
                }
                if (h == null) {
                    h = URLNormalizerUtils.dnsToIDN(toProcess, 0, toProcess.length());
                }
                internalURI.host = h;
                internalURI.path = "/";
                internalURI.processed = true;
                toProcess.delete(0, toProcess.length());
                return;

            }
        }

        switch (toProcess.codePointAt(i)) {
        case '.':
            // A dot before any other delimiter: dotted host, unless an '@' follows (user info).
            int m = URLNormalizerUtils.findFirstMatch(toProcess, "@/:?#", i + 1);
            if (m < 0) {
                m = toProcess.length();
            } else if (toProcess.codePointAt(m) == '@') {
                internalURI.userInfo = URLNormalizerUtils.normalizeUserInfo(toProcess, 0, m);
                // +1 also removes the '@' separator
                toProcess.delete(0, internalURI.userInfo.length() + 1);
                break;
            }
            String host = null;
            try {
                host = URLNormalizerUtils.decodeIPv4(toProcess, 0, m);
            } catch (URLHostParseException e) {
                LOG.debug("Not an IPv4:'{}', trying DNS.", toProcess.toString());
            }
            if (host == null) {
                host = URLNormalizerUtils.dnsToIDN(toProcess, 0, m);
            }
            internalURI.host = host;
            toProcess.delete(0, host.length());
            if (toProcess.length() == 0) {
                internalURI.path = "/";
                internalURI.processed = true;
            }
            break;
        case '/':
            if (i == 0 && URLNormalizerUtils.findFirstMatch(toProcess, "/", i + 1) == 1) {
                // Leading "//" (authority marker): drop it and continue with what follows.
                toProcess.delete(i, 2);
                break;
            } else if (internalURI.host == null && i == 0) {
                throw new URLMalformedInputException("Cannot determine host:" + original);
            }
            if (internalURI.host == null) {
                // Text before the first '/' is the host.
                String s = null;
                try {
                    s = URLNormalizerUtils.decodeIPv4(toProcess, 0, i);
                } catch (URLHostParseException e) {
                    // not an IPv4 address - fall back to DNS name handling below
                }
                if (s == null) {
                    s = URLNormalizerUtils.dnsToIDN(toProcess, 0, i);
                }
                internalURI.host = s;
                toProcess.delete(0, s.length());
                break;
            }
            // Host already known: everything up to '?' or '#' is the path.
            int tmp = URLNormalizerUtils.findFirstMatch(toProcess, "?#", 0);
            if (tmp < 0) {
                tmp = toProcess.length();
            }
            internalURI.path = URLNormalizerUtils.normlizePath(toProcess, 0, tmp);
            toProcess.delete(0, internalURI.path.length());
            if (toProcess.length() == 0) {
                internalURI.processed = true;
            }
            break;
        case '[': {
            internalURI.host = URLNormalizerUtils.decodeIPv6(toProcess);
            toProcess.delete(0, internalURI.host.length());
            if (toProcess.length() == 0) {
                internalURI.path = "/";
                internalURI.processed = true;
            }
        }
            break;
        case '?': {
            if (internalURI.path == null) {
                internalURI.path = "/";
            }
            internalURI.query = URLNormalizerUtils.normalizeQuery(toProcess, 0, toProcess.length());
            toProcess.delete(0, internalURI.query.length());
            if (toProcess.length() == 0) {
                internalURI.processed = true;
            }
        }
            break;
        case '@':
            internalURI.userInfo = URLNormalizerUtils.normalizeUserInfo(toProcess, 0, i);
            // +1 also removes the '@' separator
            toProcess.delete(0, internalURI.userInfo.length() + 1);
            break;
        case '#':
            if (internalURI.path == null) {
                internalURI.path = "/";
            }
            // The fragment always extends to the end of the input.
            internalURI.fragment = URLNormalizerUtils.normalizeFragment(toProcess, 0, toProcess.length());
            toProcess.delete(0, toProcess.length());
            internalURI.processed = true;
            break;
        case ':':
            if (internalURI.host != null) {
                // Host already known: the colon introduces the port.
                if (i == toProcess.length() - 1) {
                    internalURI.path = "/";
                }
                int end = URLNormalizerUtils.findFirstMatch(toProcess, "/?#", i);
                if (end < 0) {
                    end = toProcess.length();
                }
                int port = -1;
                if (i + 1 < end) {
                    String portText = toProcess.substring(i + 1, end);
                    try {
                        port = Integer.parseInt(portText);
                    } catch (NumberFormatException e) {
                        // Report bad ports through the declared checked exception rather than
                        // leaking an unchecked NumberFormatException to callers of normalize().
                        throw new URLMalformedInputException("Cannot parse port:" + portText, e);
                    }
                }
                internalURI.port = port;
                toProcess.delete(0, end);
                if (toProcess.length() == 0) {
                    internalURI.processed = true;
                    internalURI.path = "/";
                }

            } else {
                if (i > 0) {
                    // "name:..." with no host yet: user:password@ if an '@' comes first, else host:port.
                    int end = URLNormalizerUtils.findFirstMatch(toProcess, "/@?", i);
                    if (end > 0 && toProcess.codePointAt(end) == '@') {
                        internalURI.userInfo = URLNormalizerUtils.normalizeUserInfo(toProcess, 0, end);
                        toProcess.delete(0, internalURI.userInfo.length() + 1);
                        break;
                    }
                    String h = URLNormalizerUtils.numToIPv4(toProcess, 0, i);
                    if (h == null) {
                        h = URLNormalizerUtils.dnsToIDN(toProcess, 0, i);
                    }
                    internalURI.host = h;
                    toProcess.delete(0, h.length());
                    break;
                }
                // Leading ':' with no host: only valid as user info with an empty user name.
                int u = URLNormalizerUtils.findFirstMatch(toProcess, "@", i);
                if (u < i) {
                    throw new URLMalformedInputException("Cannot process userinfo");
                }
                internalURI.userInfo = URLNormalizerUtils.normalizeUserInfo(toProcess, 0, u);
                toProcess.delete(0, internalURI.userInfo.length() + 1);
            }
            break;
        default:
            if (internalURI.host != null) {
                throw new URLMalformedInputException("Cannot process URL:" + toProcess.toString());
            }
            break;
        }

        // Continue with whatever is left; the guard at the top ends the recursion.
        processURL();

    }

    /**
     * First parsing stage: creates {@code internalURI} and determines the scheme,
     * or - when the input carries no scheme - the leading user info or host.
     * <p>
     * The method finds the first of the characters {@code : [ ] @ . / ? #} in
     * {@code toProcess} and dispatches on it. In every branch except the one that
     * delegates to the scheme-aware {@code URI} constructor, the scheme is set to
     * {@link #DEFAULT_SCHEMA}. Text that has been consumed is deleted from the front
     * of {@code toProcess}; when nothing is left to parse the branch also sets the
     * path to {@code "/"} and marks the result as processed.
     * <p>
     * Package-private: access for tests only.
     *
     * @throws URLMalformedInputException if the buffer is empty, starts with a character
     *         that cannot open a URL, has an unmatched IPv6 bracket, or has invalid user info
     * @throws URLHostParseException if the host cannot be interpreted or the first
     *         delimiter is not handled by any branch
     */
    void processSchemeOrHost() throws URLMalformedInputException, URLHostParseException {
        LOG.debug("Extracting scheme from input: {}", toProcess.toString());
        if (toProcess.length() == 0) {
            throw new URLMalformedInputException("URL for processing cannot be empty");
        }
        int i = URLNormalizerUtils.findFirstMatch(toProcess, ":[]@./?#", 0);
        // A delimiter at position 0 is only acceptable if it is '[', ':' or '.'.
        if (i == 0 && !(toProcess.codePointAt(i) == '[' || toProcess.codePointAt(i) == ':'
                || toProcess.codePointAt(i) == '.')) {
            throw new URLMalformedInputException();
        }
        // No delimiter at all: the whole input is a numeric IP or a hostname.
        if (i < 0) {
            String ip = null;
            try {
                ip = URLNormalizerUtils.decodeIPv4(toProcess, 0, toProcess.length());
            } catch (URLHostParseException e) {
                // not an IPv4 address - fall back to DNS name handling below
            }
            this.internalURI = new URI();
            if (ip == null) {
                ip = URLNormalizerUtils.dnsToIDN(toProcess);
            }
            internalURI.scheme = DEFAULT_SCHEMA;
            internalURI.isUrl = true;
            internalURI.host = ip;
            internalURI.path = "/";
            internalURI.processed = true;
            return;
        }

        switch (toProcess.codePointAt(i)) {
        case ':':
            if (i > 0) {
                // Text before the colon is treated as the scheme; the URI constructor decides
                // whether it is http/https or an opaque scheme.
                internalURI = new URI(toProcess, i);
            } else {
                // Input starts with ':' - handled as an IPv6 host under the default scheme.
                internalURI = new URI();
                internalURI.scheme = DEFAULT_SCHEMA;
                internalURI.isUrl = true;
                internalURI.host = URLNormalizerUtils.decodeIPv6(toProcess);
                if (toProcess.length() == toProcess.indexOf("]") + 1) {
                    internalURI.processed = true;
                    internalURI.path = "/";
                } else {
                    int rem = toProcess.indexOf("]");
                    toProcess.delete(0, rem + 1);
                }
            }
            break;

        case '[':
            // Bracketed IPv6 literal; it must start the input and have a closing bracket.
            int clBr = URLNormalizerUtils.findFirstMatch(toProcess, "]", i);
            if (i > 0 || clBr < 0) {
                throw new URLMalformedInputException("Cannot find matched bracket for IPv6");
            }
            // Decode a copy of the bracketed part rather than the working buffer itself.
            StringBuilder sb = new StringBuilder(toProcess.substring(i, clBr + 1));
            if (internalURI == null)
                internalURI = new URI();
            internalURI.host = URLNormalizerUtils.decodeIPv6(sb);
            internalURI.scheme = DEFAULT_SCHEMA;
            internalURI.isUrl = true;
            if (toProcess.indexOf("]") == toProcess.length() - 1) {
                internalURI.path = "/";
                internalURI.processed = true;
            } else {
                int rem = toProcess.indexOf("]");
                toProcess.delete(i, rem + 1);
            }
            break;

        case '@':
            // Input starts with user info under the default scheme.
            // NOTE(review): unlike the other default-scheme branches this one never sets
            // isUrl = true - confirm whether that is intentional.
            internalURI = new URI();
            internalURI.scheme = DEFAULT_SCHEMA;
            try {
                internalURI.userInfo = URLNormalizerUtils.normalizeUserInfo(toProcess, 0, i);
            } catch (URIException e) {
                throw new URLMalformedInputException("Cannot process userinfo:" + toProcess.substring(0, i), e);
            }
            // i + 1 also removes the '@' separator
            toProcess.delete(0, i + 1);
            break;

        case '.':
            // Dotted host without scheme: either a DNS name or a dotted IPv4 address.
            String delim = "/?#:";
            int hEnd = URLNormalizerUtils.findFirstMatch(toProcess, delim, i);
            if (hEnd < 0) {
                hEnd = toProcess.length();
            }
            String enc;
            // Treated as a DNS name when it matches the dnsEnd pattern or ends with a dot.
            if (URLNormalizerUtils.dnsEnd.matcher(toProcess.substring(0, hEnd)).matches()
                    || toProcess.codePointAt(hEnd - 1) == '.') {
                enc = URLNormalizerUtils.dnsToIDN(toProcess, 0, hEnd);
            } else {
                enc = URLNormalizerUtils.decodeIPv4(toProcess, 0, hEnd);
            }
            internalURI = new URI();
            internalURI.scheme = DEFAULT_SCHEMA;
            internalURI.isUrl = true;
            internalURI.host = enc;
            // The delimiter position is looked up again after the host helpers have run
            // (presumably because they may rewrite the buffer in place - TODO confirm).
            int rem = URLNormalizerUtils.findFirstMatch(toProcess, delim, 0);
            if (rem < 0) {
                internalURI.path = "/";
                internalURI.processed = true;
            } else {
                toProcess.delete(0, rem);
            }
            break;
        // Either a numeric IP or a hostname, followed by a path.
        case '/':
            String ip = null;
            // Reject hosts shorter than two characters and a "//" right after the host.
            if (i < 2 || (toProcess.length() > i + 1 && toProcess.codePointAt(i + 1) == '/')) {
                throw new URLMalformedInputException("Cannot process URL:" + toProcess.toString());
            }
            try {
                ip = URLNormalizerUtils.decodeIPv4(toProcess, 0, i);
            } catch (URLHostParseException e) {
                // not an IPv4 address - fall back to DNS name handling below
            }
            if (this.internalURI == null) {
                this.internalURI = new URI();
            }
            if (ip == null) {
                ip = URLNormalizerUtils.dnsToIDN(toProcess, 0, i);
            }
            internalURI.scheme = DEFAULT_SCHEMA;
            internalURI.isUrl = true;
            internalURI.host = ip;
            // Keep the '/' itself in the buffer so that the next stage sees the path.
            int toDel = toProcess.indexOf("/");
            toProcess.delete(0, toDel);
            break;
        default:
            throw new URLHostParseException("Cannot normalize:" + toProcess.toString());

        }

    }

    /**
     * Mutable holder for the components produced while parsing.
     * <p>
     * For {@code http}/{@code https} inputs the individual components (user info,
     * host, port, path, query, fragment) are filled in by the enclosing class; for
     * any other scheme only {@code scheme} and {@code hierPart} are stored.
     */
    public static class URI {
        /** Set once parsing has finished and the URI can be rendered. */
        public boolean processed;
        /** {@code true} when the input is handled as an http/https URL. */
        boolean isUrl = false;

        String scheme;
        /** Everything after {@code "scheme:"} for non-http(s) schemes. */
        StringBuilder hierPart;
        String path;
        String host;
        String userInfo;
        /** {@code -1} until a port has been parsed. */
        int port = -1;
        /** Appended verbatim after the path when rendering. */
        String query;
        /** Appended verbatim after the query when rendering. */
        String fragment;

        URI() {
        }

        /**
         * Builds a URI whose scheme is the first {@code end} characters of {@code sb}.
         * <p>
         * For http/https the scheme is lower-cased and removed from {@code sb} together
         * with the following character, leaving the rest for further parsing. For any
         * other scheme the object is marked as processed immediately and the text after
         * the scheme separator is copied into {@code hierPart}; {@code sb} is not
         * shortened in that case.
         * <p>
         * According to the RFC the scheme is case-insensitive, but the data contract
         * differs, so non-http(s) schemes keep their original case.
         *
         * @param sb  working buffer starting with the scheme
         * @param end index of the scheme separator within {@code sb}
         */
        URI(StringBuilder sb, int end) {
            final String decoded = URLNormalizerUtils.removeObfuscatedEncoding(sb, 0, end,
                    new EncodingType[] { EncodingType.SCHEME_ALLOWED });
            // Position just past the separator that follows the (decoded) scheme.
            final int afterSeparator = decoded.length() + 1;
            if (isHttpOrHttps(decoded)) {
                this.scheme = decoded.toLowerCase();
                sb.delete(0, afterSeparator);
                this.isUrl = true;
            } else {
                this.scheme = decoded;
                this.processed = true;
                this.hierPart = new StringBuilder(sb.subSequence(afterSeparator, sb.length()));
            }
        }

        /** Case-insensitive test for the two schemes that are parsed component by component. */
        private static boolean isHttpOrHttps(String candidate) {
            return candidate.equalsIgnoreCase(DEFAULT_SCHEMA) || candidate.equalsIgnoreCase("https");
        }

        /**
         * Renders the stored components as a string.
         *
         * @return {@code null} while {@code processed} is unset; for http/https
         *         {@code scheme://[userInfo@]host[:port]path[query][fragment]} (the port is
         *         written only when it is greater than zero); otherwise {@code scheme:hierPart}
         */
        public String getURIasString() {
            if (!processed) {
                return null;
            }
            if (!isHttpOrHttps(scheme)) {
                return scheme + ":" + hierPart.toString();
            }
            final StringBuilder out = new StringBuilder();
            out.append(scheme);
            out.append("://");
            if (userInfo != null) {
                out.append(userInfo);
                out.append("@");
            }
            out.append(host);
            if (port > 0) {
                out.append(":");
                out.append(port);
            }
            out.append(path);
            if (query != null) {
                out.append(query);
            }
            if (fragment != null) {
                out.append(fragment);
            }
            return out.toString();
        }

    }

}