com.seajas.search.utilities.web.WebResourceLocators.java Source code

Introduction

Here is the source code for com.seajas.search.utilities.web.WebResourceLocators.java
Source

/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.utilities.web;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;

/**
 * Internet location routines.
 * 
 * @author Pascal S. de Kloe <pascal@quies.net>
 */
public final class WebResourceLocators {
    /**
     * Logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(WebResourceLocators.class);

    /**
     * The ASCII characters which should always be percent encoded.
     */
    private static final String ASCII_ESCAPE_SET = ",'\" `";

    /**
     * Percent escapes for the predefined XML entities, keyed by the entity name without the surrounding
     * {@code &} and {@code ;}.
     */
    private static final Map<String, String> xmlEntityPercentEscapes = getXmlEntityPercentEscapes();

    /**
     * Default constructor. Private because this class only offers static routines.
     */
    private WebResourceLocators() {
    }

    /**
     * Gets a resource locator from a leniently interpreted serialized form.
     * <p>
     * The input is trimmed and then repaired in three steps before it is handed to {@link URI#URI(String)}:
     * <ol>
     * <li>every character from {@link #ASCII_ESCAPE_SET} is percent encoded;</li>
     * <li>XML entities ({@code &amp;name;}, {@code &amp;#ddd;} and {@code &amp;#xhh;}) are replaced, unknown or
     * malformed ones are left untouched;</li>
     * <li>every hash mark after the first one is percent encoded, such that only one fragment remains.</li>
     * </ol>
     * The result is normalized with {@link URI#normalize()}.
     * 
     * @param uri
     *                the serialized form.
     * @return the normalized resource locator.
     * @throws URISyntaxException
     *                 when {@code uri} is {@code null} or blank, or when the repaired form is still not a
     *                 valid URI.
     */
    public static URI parseURI(String uri) throws URISyntaxException {
        if (!StringUtils.hasText(uri))
            throw new URISyntaxException("", "No content");

        String repaired = escapeAsciiSet(uri.trim());
        repaired = replaceXmlEntities(repaired);
        repaired = escapeSurplusHashMarks(repaired);

        return new URI(repaired).normalize();
    }

    /**
     * Gets a resource locator. The optional base URIs are used to resolve relative paths in order of appearance.
     * <p>
     * Resolution stops as soon as the result is absolute. Base URIs which can not be parsed are skipped. When
     * no base leads to an absolute result, the (possibly partially resolved) relative URI is returned.
     * 
     * @param uri
     *                the serialized form.
     * @param baseURIs
     *                the serialized forms.
     * @return the resource locator, absolute when one of the bases allowed for it.
     * @throws URISyntaxException
     *                 when {@code uri} itself can not be interpreted.
     */
    public static URI parseURI(final String uri, final String... baseURIs) throws URISyntaxException {
        URI result = parseURI(uri);

        if (!result.isAbsolute() && baseURIs != null) {
            for (String base : baseURIs) {
                try {
                    result = parseURI(base).resolve(result);

                    if (result.isAbsolute())
                        break;
                } catch (URISyntaxException e) {
                    if (logger.isDebugEnabled())
                        logger.debug(String.format("Skipping unparseable base URI %s for %s", base, uri), e);
                }
            }
        }

        return result;
    }

    /**
     * Percent encodes every occurrence of the characters in {@link #ASCII_ESCAPE_SET}.
     * 
     * @param text
     *                the text to escape.
     * @return the text with the applicable characters replaced by their {@code %XX} form.
     */
    private static String escapeAsciiSet(String text) {
        // Single pass; each escape adds two characters, hence the small head room.
        StringBuilder buffer = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); ++i) {
            char c = text.charAt(i);
            if (ASCII_ESCAPE_SET.indexOf(c) >= 0)
                buffer.append(String.format("%%%02X", (int) c));
            else
                buffer.append(c);
        }
        return buffer.toString();
    }

    /**
     * Replaces the XML entities in a URI. Entities without a known replacement are left as is.
     * 
     * @param uri
     *                the serialized form.
     * @return the serialized form with the recognized entities replaced.
     */
    private static String replaceXmlEntities(String uri) {
        // The search resumes right after the ampersand, which is safe because no replacement contains one.
        for (int start = uri.indexOf('&'); start >= 0; start = uri.indexOf('&', start + 1)) {
            int end = uri.indexOf(';', start);
            if (end < 0)
                break;
            String entity = uri.substring(start + 1, end);

            String replacement;
            if (entity.startsWith("#"))
                replacement = decodeNumericEntity(entity);
            else
                replacement = xmlEntityPercentEscapes.get(entity);

            if (replacement != null) {
                if (logger.isDebugEnabled())
                    logger.debug(String.format("Replaced entity %s with %s for %s", entity, replacement, uri));
                uri = uri.substring(0, start) + replacement + uri.substring(end + 1);
            } else if (logger.isDebugEnabled()) {
                logger.debug(String.format("Skiped entity %s for %s", entity, uri));
            }
        }
        return uri;
    }

    /**
     * Interprets a numeric character reference.
     * 
     * @param entity
     *                the entity body starting with the hash mark, e.g. {@code #38} or {@code #x26}.
     * @return a percent escape for code points up to {@code 0xFF}, the character(s) themselves for higher
     *         code points, or {@code null} when the reference is not a number or not a Unicode code point.
     */
    private static String decodeNumericEntity(String entity) {
        int codepoint;
        try {
            codepoint = entity.startsWith("#x") ? Integer.parseInt(entity.substring(2), 16)
                    : Integer.parseInt(entity.substring(1));
        } catch (NumberFormatException e) {
            logger.trace("Unparseable numeric entity.", e);
            return null;
        }

        // Integer.parseInt accepts a sign. A negative value would otherwise be formatted
        // as an eight digit escape such as %FFFFFFFB, which corrupts the URI.
        if (!Character.isValidCodePoint(codepoint)) {
            logger.trace("Numeric entity {} is not a Unicode code point.", entity);
            return null;
        }

        // NOTE(review): 0x80-0xFF become a single escaped octet (Latin-1 style, not UTF-8).
        if (codepoint <= 0xFF)
            return String.format("%%%02X", codepoint);
        return new String(Character.toChars(codepoint));
    }

    /**
     * Percent encodes all hash marks except for the first one, which remains the fragment separator.
     * 
     * @param uri
     *                the serialized form.
     * @return the serialized form with at most one literal hash mark.
     */
    private static String escapeSurplusHashMarks(String uri) {
        int first = uri.indexOf('#');
        if (first < 0 || first == uri.lastIndexOf('#'))
            return uri;
        return uri.substring(0, first + 1) + uri.substring(first + 1).replace("#", "%23");
    }

    /**
     * Builds the percent escapes for the five predefined XML entities.
     * 
     * @return the mapping from entity name to percent escape.
     */
    private static Map<String, String> getXmlEntityPercentEscapes() {
        Map<String, String> map = new HashMap<String, String>();
        map.put("amp", "%26");
        map.put("lt", "%3C");
        map.put("gt", "%3E");
        map.put("apos", "%27");
        map.put("quot", "%22");
        return map;
    }
}