Java tutorial
/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.contender.service.exploration;

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.htmlcleaner.CustomDomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * A simple exploration service, which scans for any indirectly accessible feed links.
 *
 * @author Jasper van Veghel <jasper@seajas.com>
 */
@Service
public class ExplorationService {
    /**
     * The logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(ExplorationService.class);

    /**
     * Make an attempt to retrieve all feed links from the given HTML content.
     *
     * @param uri the base URI of the page, used to resolve relative links
     * @param htmlContent the HTML content to scan for alternate-link tags
     * @return List<String> the discovered feed URLs, or null if the content could not be parsed
     */
    public List<String> getIndirectlyAccessibleFeedLinks(final URI uri, final String htmlContent) {
        List<String> result = new ArrayList<String>();

        try {
            // Clean up the (potentially malformed) HTML and serialize it into a W3C DOM document
            HtmlCleaner cleaner = new HtmlCleaner();
            TagNode node = cleaner.clean(htmlContent);
            Document document = new CustomDomSerializer(cleaner.getProperties(), true).createDOM(node);

            // Now try to extract the appropriate links
            XPath xpath = XPathFactory.newInstance().newXPath();

            try {
                // Select the href attributes of <link> tags in the document head whose type
                // contains 'rss+xml' or 'atom+xml', matched case-insensitively
                XPathExpression xpathExpression = xpath.compile(
                        "//head/link[contains(translate(@type, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'rss+xml') or contains(translate(@type, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'atom+xml')]/@href");

                NodeList nodeList = (NodeList) xpathExpression.evaluate(document, XPathConstants.NODESET);

                for (int i = 0; i < nodeList.getLength(); i++) {
                    Node listNode = nodeList.item(i);

                    if (listNode instanceof Attr) {
                        String resultUrl = ((Attr) listNode).getValue();

                        if (!StringUtils.hasText(resultUrl)) {
                            logger.warn("The given alternate-link tag contains no href - skipping");

                            continue;
                        }

                        // Now determine if the URL is fully-qualified - and if not, resolve it against the base URI
                        try {
                            new URL(resultUrl.trim());

                            result.add(resultUrl.trim());
                        } catch (MalformedURLException e) {
                            try {
                                result.add(uri.resolve(resultUrl.trim()).normalize().toString());
                            } catch (IllegalArgumentException e2) {
                                logger.warn("The given (presumably relative) URL is not valid - not adding to the result list", e2);
                            }
                        }
                    } else
                        logger.error("Invalid node type " + listNode.getNodeType() + " - skipping");
                }
            } catch (XPathExpressionException e) {
                logger.error("Could not apply the given XPath expression to extract RSS alternate links", e);
            }
        } catch (ParserConfigurationException e) {
            logger.info("Could not serialize the given content", e);

            return null;
        }

        return result;
    }
}
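
As a quick illustration of how the service might be driven, here is a minimal sketch that fetches a page and prints any discovered feed links. The example URL is a placeholder and the plain-URLConnection fetch is an assumption for demonstration only; in the project itself the @Service bean would normally be injected by Spring rather than instantiated directly.

package com.seajas.search.contender.service.exploration;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.List;

public class ExplorationServiceExample {
    public static void main(String[] args) throws Exception {
        // Placeholder URL for demonstration - any page advertising its feeds via <link> tags in the head will do
        URI pageUri = URI.create("http://example.com/blog/");

        // Naive fetch, for demonstration only - real code would set timeouts and handle redirects and encodings properly
        StringBuilder content = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(pageUri.toURL().openStream(), StandardCharsets.UTF_8));
        try {
            String line;
            while ((line = reader.readLine()) != null)
                content.append(line).append('\n');
        } finally {
            reader.close();
        }

        // Instantiated directly here for brevity - under Spring the service would be injected as a bean
        ExplorationService explorationService = new ExplorationService();
        List<String> feedLinks = explorationService.getIndirectlyAccessibleFeedLinks(pageUri, content.toString());

        if (feedLinks == null)
            System.err.println("The given content could not be parsed");
        else
            for (String feedLink : feedLinks)
                System.out.println("Discovered feed: " + feedLink);
    }
}

Note that the XPath expression relies on translate() to lower-case the type attribute before matching, because XPath 1.0 (the version available through javax.xml.xpath) has no lower-case() function; this is why a tag with type="application/RSS+xml" is matched just as well as one with type="application/rss+xml".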