Java tutorial
/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.contender.service.exploration;

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.htmlcleaner.CustomDomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * A simple exploration service, which scans for any indirectly accessible feed links.
 *
 * @author Jasper van Veghel <jasper@seajas.com>
 */
@Service
public class ExplorationService {
    /**
     * The logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(ExplorationService.class);

    /**
     * Make an attempt to retrieve all feed links from the given HTML content.
     *
     * @param uri the base URI of the page, used to resolve relative links
     * @param htmlContent the HTML content to scan for alternate-link tags
     * @return List<String> the discovered feed URLs, or null if the content could not be parsed
     */
    public List<String> getIndirectlyAccessibleFeedLinks(final URI uri, final String htmlContent) {
        List<String> result = new ArrayList<String>();

        try {
            // Clean up the (potentially malformed) HTML and serialize it into a W3C DOM document
            HtmlCleaner cleaner = new HtmlCleaner();
            TagNode node = cleaner.clean(htmlContent);
            Document document = new CustomDomSerializer(cleaner.getProperties(), true).createDOM(node);

            // Now try to extract the appropriate links
            XPath xpath = XPathFactory.newInstance().newXPath();

            try {
                // Select the href attributes of <link> tags in the document head whose type
                // contains 'rss+xml' or 'atom+xml', matched case-insensitively
                XPathExpression xpathExpression = xpath.compile(
                        "//head/link[contains(translate(@type, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'rss+xml') or contains(translate(@type, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'atom+xml')]/@href");

                NodeList nodeList = (NodeList) xpathExpression.evaluate(document, XPathConstants.NODESET);

                for (int i = 0; i < nodeList.getLength(); i++) {
                    Node listNode = nodeList.item(i);

                    if (listNode instanceof Attr) {
                        String resultUrl = ((Attr) listNode).getValue();

                        if (!StringUtils.hasText(resultUrl)) {
                            logger.warn("The given alternate-link tag contains no href - skipping");

                            continue;
                        }

                        // Now determine if the URL is fully-qualified - and if not, resolve it against the base URI
                        try {
                            new URL(resultUrl.trim());

                            result.add(resultUrl.trim());
                        } catch (MalformedURLException e) {
                            try {
                                result.add(uri.resolve(resultUrl.trim()).normalize().toString());
                            } catch (IllegalArgumentException e2) {
                                logger.warn("The given (presumably relative) URL is not valid - not adding to the result list", e2);
                            }
                        }
                    } else
                        logger.error("Invalid node type " + listNode.getNodeType() + " - skipping");
                }
            } catch (XPathExpressionException e) {
                logger.error("Could not apply the given XPath expression to extract RSS alternate links", e);
            }
        } catch (ParserConfigurationException e) {
            logger.info("Could not serialize the given content", e);

            return null;
        }

        return result;
    }
}
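
As a quick illustration of how the service might be driven, here is a minimal sketch that fetches a page and prints any discovered feed links. The example URL is a placeholder and the plain-URLConnection fetch is an assumption for demonstration only; in the project itself the @Service bean would normally be injected by Spring rather than instantiated directly.

package com.seajas.search.contender.service.exploration;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.List;

public class ExplorationServiceExample {
    public static void main(String[] args) throws Exception {
        // Placeholder URL for demonstration - any page advertising its feeds via <link> tags in the head will do
        URI pageUri = URI.create("http://example.com/blog/");

        // Naive fetch, for demonstration only - real code would set timeouts and handle redirects and encodings properly
        StringBuilder content = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(pageUri.toURL().openStream(), StandardCharsets.UTF_8));
        try {
            String line;
            while ((line = reader.readLine()) != null)
                content.append(line).append('\n');
        } finally {
            reader.close();
        }

        // Instantiated directly here for brevity - under Spring the service would be injected as a bean
        ExplorationService explorationService = new ExplorationService();
        List<String> feedLinks = explorationService.getIndirectlyAccessibleFeedLinks(pageUri, content.toString());

        if (feedLinks == null)
            System.err.println("The given content could not be parsed");
        else
            for (String feedLink : feedLinks)
                System.out.println("Discovered feed: " + feedLink);
    }
}

Note that the XPath expression relies on translate() to lower-case the type attribute before matching, because XPath 1.0 (the version available through javax.xml.xpath) has no lower-case() function; this is why a tag with type="application/RSS+xml" is matched just as well as one with type="application/rss+xml".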