Java tutorial
/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.utilities.web;

import nu.validator.htmlparser.dom.HtmlDocumentBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.xml.SimpleNamespaceContext;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.Reader;
import java.io.StringReader;

import static javax.xml.xpath.XPathConstants.NODESET;

/**
 * HTML utilities.
 *
 * <p>The text helpers ({@link #getText(Node)} and the {@code joinText} overloads) are static
 * and keep no state. The instance methods create an HTML parser and an XPath engine on first
 * use and cache them in plain, unsynchronized fields.
 * NOTE(review): whether one instance may be shared between threads depends on the cached
 * parser and XPath engine - confirm before sharing.
 *
 * @author Pascal S. de Kloe
 */
public class WebPages {

    private static final Logger logger = LoggerFactory.getLogger(WebPages.class);

    /** Lazily created by {@link #parse(String)}. */
    private DocumentBuilder htmlParser;

    /** Lazily created by {@link #getXPathEngine()}. */
    private XPath xPathEngine;

    /**
     * Gets the textual content.
     *
     * <p>For an element, the content of all descendants is concatenated in document order.
     * Any other node contributes its {@link Node#getNodeValue() node value}. A node without
     * a value (a {@link Document}, for instance) therefore yields an empty string; pass its
     * document element to get the page text.
     *
     * @param html the markup fragment; {@code null} is treated as having no text.
     * @return the text, never {@code null}.
     */
    public static String getText(Node html) {
        // One shared buffer for the whole subtree, so text is not re-copied per nesting level.
        StringBuilder text = new StringBuilder();
        appendText(html, text);
        return text.toString();
    }

    /** Appends the textual content of {@code node} and its descendants to {@code text}. */
    private static void appendText(Node node, StringBuilder text) {
        if (node == null) {
            return;
        }

        if (node.getNodeType() == Node.ELEMENT_NODE) {
            logger.trace("Traveling element node");

            NodeList children = node.getChildNodes();
            int count = children.getLength();
            // Parameterized logging: nothing is formatted unless debug is enabled.
            logger.debug("Searching for text in {} elements", count);

            for (int i = 0; i < count; ++i) {
                appendText(children.item(i), text);
            }
            return;
        }

        String value = node.getNodeValue();
        if (value != null) {
            text.append(value);
        }
    }

    /**
     * Concatenates the {@link #getText(org.w3c.dom.Node) textual content}.
     *
     * @param html the markup fragments.
     * @param separator the optional join characters.
     * @return the joined text, never {@code null}.
     */
    public static String joinText(NodeList html, String separator) {
        return joinText(convertToArray(html), separator);
    }

    /**
     * Concatenates the {@link #getText(org.w3c.dom.Node) textual content}.
     * Fragments without text are skipped, so no separator is emitted for them.
     *
     * @param html the markup fragments.
     * @param separator the optional join characters; {@code null} joins without separator.
     * @return the joined text, never {@code null}.
     */
    public static String joinText(Node[] html, String separator) {
        logger.debug("Joining {} nodes with '{}'", html.length, separator);

        StringBuilder text = new StringBuilder();
        for (Node fragment : html) {
            String addition = getText(fragment);
            if (addition.isEmpty()) {
                continue;
            }
            if (text.length() != 0 && separator != null) {
                text.append(separator);
            }
            text.append(addition);
        }
        return text.toString();
    }

    /** Copies the entries of {@code list} into a new array, preserving their order. */
    private static Node[] convertToArray(NodeList list) {
        int length = list.getLength();
        Node[] copy = new Node[length];
        for (int i = 0; i < length; i++) {
            copy[i] = list.item(i);
        }
        return copy;
    }

    /**
     * Creates a selector for the given data.
     *
     * @param data a DOM {@link Node}, or a {@link String} with serialized markup which is
     *        {@link #parse(String) parsed} first.
     * @return a {@code CssSelector} over the node or the parsed markup; a {@code NullSelector}
     *         when the data is of any other type (including {@code null}) or when the markup
     *         can't be parsed.
     */
    public Selector selector(Object data) {
        if (data instanceof Node) {
            return new CssSelector((Node) data);
        }

        if (data instanceof String) {
            Document document = parse((String) data);
            if (document == null) {
                // parse() has already logged the failure; don't hand a null DOM to CssSelector.
                return new NullSelector();
            }
            return new CssSelector(document);
        }

        return new NullSelector();
    }

    /**
     * Builds a DOM.
     *
     * @param html the serialized markup.
     * @return the document, or {@code null} when parsing fails for any reason (the failure
     *         is logged at error level).
     */
    public Document parse(String html) {
        DocumentBuilder parser = htmlParser;
        if (parser == null) {
            parser = new HtmlDocumentBuilder();
            htmlParser = parser;
        }

        try {
            Reader reader = new StringReader(html);
            return parser.parse(new InputSource(reader));
        } catch (Exception e) {
            // Deliberately broad: callers rely on a null result rather than an exception,
            // which also covers a null argument.
            logger.error("Can't parse HTML", e);
            return null;
        }
    }

    /**
     * Evaluates an XPath.
     *
     * <p>The prefix {@code ht} is bound to the XHTML namespace
     * ({@code http://www.w3.org/1999/xhtml}), e.g. {@code //ht:a}.
     *
     * @param html the markup root.
     * @param xPath the expression.
     * @return the matching nodes.
     * @throws XPathExpressionException if the expression can't be evaluated.
     */
    public NodeList findNodes(Node html, String xPath) throws XPathExpressionException {
        XPath engine = getXPathEngine();
        NodeList result = (NodeList) engine.evaluate(xPath, html, NODESET);
        if (logger.isDebugEnabled()) {
            logger.debug("XPath {} gave {} nodes", xPath, result.getLength());
        }
        return result;
    }

    /**
     * Gets the cached XPath engine, creating it on first use with the {@code ht} prefix
     * bound to the XHTML namespace.
     */
    private XPath getXPathEngine() {
        XPath engine = xPathEngine;
        if (engine == null) {
            XPathFactory factory = XPathFactory.newInstance();
            engine = factory.newXPath();

            SimpleNamespaceContext namespaces = new SimpleNamespaceContext();
            namespaces.bindNamespaceUri("ht", "http://www.w3.org/1999/xhtml");
            engine.setNamespaceContext(namespaces);

            xPathEngine = engine;
        }
        return engine;
    }
}