Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.jboss.elasticsearch.river.remote.sitemap; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.util.Date; import java.util.zip.GZIPInputStream; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.apache.commons.io.input.BOMInputStream; import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.Loggers; import org.jboss.elasticsearch.river.remote.sitemap.AbstractSiteMap.SitemapType; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.ErrorHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; /** * @author http://code.google.com/p/crawler-commons */ public class SiteMapParser { private static final ESLogger logger = Loggers.getLogger(SiteMapParser.class); /** According to the specs, 50K URLs per Sitemap is the max */ private static final int MAX_URLS = 50000; /** Sitemap docs must be limited to 10MB (10,485,760 bytes) */ public static int MAX_BYTES_ALLOWED = 10485760; /** True (by default) if invalid URLs should be rejected */ private boolean strict; public SiteMapParser() { this(true); } public SiteMapParser(boolean strict) { this.strict = strict; } /** * @return whether invalid URLs will be rejected */ public boolean isStrict() { return strict; } /** * Returns a SiteMap or SiteMapIndex given a content type, byte content and the URL of a sitemap */ public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException { // Use extension or MIME type to determine how we should try // to process the response if (url.getPath().endsWith(".xml") || contentType.contains("text/xml") || contentType.contains("application/xml") || contentType.contains("application/x-xml") || contentType.contains("application/atom+xml") || contentType.contains("application/rss+xml")) { // Try parsing the XML which could be in a number of formats return processXml(url, content); } else if (url.getPath().endsWith(".txt") || contentType.contains("text/plain")) { // plain text return (AbstractSiteMap) processText(content, url.toString()); } else if (url.getPath().endsWith(".gz") || contentType.contains("application/gzip") || contentType.contains("application/x-gzip") || contentType.contains("application/x-gunzip") || contentType.contains("application/gzipped") || contentType.contains("application/gzip-compressed") || contentType.contains("application/x-compress") || contentType.contains("gzip/document") || contentType.contains("application/octet-stream")) { return processGzip(url, content); } throw new UnknownFormatException("Unknown format " + contentType + " at " + url); } /** * Parse the given XML content. * * @param sitemapUrl * @param xmlContent * @return * @throws UnknownFormatException */ private AbstractSiteMap processXml(URL sitemapUrl, byte[] xmlContent) throws UnknownFormatException { BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(xmlContent)); InputSource is = new InputSource(); is.setCharacterStream(new BufferedReader(new InputStreamReader(bomIs))); return processXml(sitemapUrl, is); } /** * Process a text-based Sitemap. Text sitemaps only list URLs but no priorities, last mods, etc. * * @param content * @throws IOException */ private SiteMap processText(byte[] content, String sitemapUrl) throws IOException { logger.debug("Processing textual Sitemap"); SiteMap textSiteMap = new SiteMap(sitemapUrl); textSiteMap.setType(SitemapType.TEXT); BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(content)); @SuppressWarnings("resource") BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs)); String line; int i = 1; while ((line = reader.readLine()) != null) { if (line.length() > 0 && i <= MAX_URLS) { try { URL url = new URL(line); boolean valid = urlIsLegal(textSiteMap.getBaseUrl(), url.toString()); if (valid || !strict) { if (logger.isDebugEnabled()) { StringBuffer sb = new StringBuffer(" "); sb.append(i).append(". ").append(url); logger.debug(sb.toString()); } i++; SiteMapURL surl = new SiteMapURL(url, valid); textSiteMap.addSiteMapUrl(surl); } } catch (MalformedURLException e) { logger.debug("Bad URL [" + line + "]."); } } } textSiteMap.setProcessed(true); return textSiteMap; } /** * Decompress the gzipped content and process the resulting XML Sitemap. * * @param url - URL of the gzipped content * @param response - Gzipped content * @throws MalformedURLException * @throws IOException * @throws UnknownFormatException */ private AbstractSiteMap processGzip(URL url, byte[] response) throws MalformedURLException, IOException, UnknownFormatException { logger.debug("Processing gzip"); AbstractSiteMap smi; InputStream is = new ByteArrayInputStream(response); // Remove .gz ending String xmlUrl = url.toString().replaceFirst("\\.gz$", ""); logger.debug("XML url = " + xmlUrl); BOMInputStream decompressed = new BOMInputStream(new GZIPInputStream(is)); InputSource in = new InputSource(decompressed); in.setSystemId(xmlUrl); smi = processXml(url, in); decompressed.close(); return smi; } /** * Parse the given XML content. * * @param sitemapUrl * @param is * @throws UnknownFormatException */ private AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException { Document doc = null; try { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); db.setErrorHandler(new ErrorHandler() { @Override public void warning(SAXParseException exception) throws SAXException { logger.warn("Sitemap XML warning: " + exception.getMessage()); } @Override public void fatalError(SAXParseException exception) throws SAXException { logger.warn("Sitemap XML fatalError: " + exception.getMessage()); } @Override public void error(SAXParseException exception) throws SAXException { logger.warn("Sitemap XML error: " + exception.getMessage()); } }); doc = db.parse(is); } catch (Exception e) { throw new UnknownFormatException("Error parsing XML for " + sitemapUrl); } // See if this is a sitemap index NodeList nodeList = doc.getElementsByTagName("sitemapindex"); if (nodeList.getLength() > 0) { nodeList = doc.getElementsByTagName("sitemap"); return parseSitemapIndex(sitemapUrl, nodeList); } else if (doc.getElementsByTagName("urlset").getLength() > 0) { // This is a regular Sitemap return parseXmlSitemap(sitemapUrl, doc); } else if (doc.getElementsByTagName("link").getLength() > 0) { // Could be RSS or Atom return parseSyndicationFormat(sitemapUrl, doc); } throw new UnknownFormatException("Unknown XML format for " + sitemapUrl); } /** * Parse XML that contains a valid Sitemap. Example of a Sitemap: <?xml version="1.0" encoding="UTF-8"?> <urlset * xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url> <loc>http://www.example.com/</loc> * <lastmod>2005-01-01</lastmod> <changefreq>monthly</changefreq> <priority>0.8</priority> </url> <url> <loc * >http://www.example.com/catalog?item=12&desc=vacation_hawaii</loc> <changefreq>weekly</changefreq> </url> * </urlset> * * @param doc */ private SiteMap parseXmlSitemap(URL sitemapUrl, Document doc) { SiteMap sitemap = new SiteMap(sitemapUrl); sitemap.setType(SitemapType.XML); NodeList list = doc.getElementsByTagName("url"); // Loop through the <url>s for (int i = 0; i < list.getLength(); i++) { Node n = list.item(i); if (n.getNodeType() == Node.ELEMENT_NODE) { Element elem = (Element) n; String loc = getElementValue(elem, "loc"); URL url = null; try { url = new URL(loc); String lastMod = getElementValue(elem, "lastmod"); String changeFreq = getElementValue(elem, "changefreq"); String priority = getElementValue(elem, "priority"); boolean valid = urlIsLegal(sitemap.getBaseUrl(), url.toString()); if (valid || !strict) { SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, changeFreq, priority, valid); sitemap.addSiteMapUrl(sUrl); if (logger.isDebugEnabled()) { StringBuffer sb = new StringBuffer(" "); sb.append(i + 1).append(". ").append(sUrl); logger.debug(sb.toString()); } } } catch (MalformedURLException e) { // e.printStackTrace(); // Can't create an entry with a bad URL logger.debug("Bad url: [" + loc + "]"); } } } sitemap.setProcessed(true); return sitemap; } /** * Parse XML that contains a Sitemap Index. Example Sitemap Index: * * <?xml version="1.0" encoding="UTF-8"?> <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <sitemap> * <loc>http://www.example.com/sitemap1.xml.gz</loc> <lastmod>2004-10-01T18:23:17+00:00</lastmod> </sitemap> <sitemap> * <loc>http://www.example.com/sitemap2.xml.gz</loc> <lastmod>2005-01-01</lastmod> </sitemap> </sitemapindex> * * @param url - URL of Sitemap Index * @param nodeList */ private SiteMapIndex parseSitemapIndex(URL url, NodeList nodeList) { logger.debug("Parsing Sitemap Index"); SiteMapIndex sitemapIndex = new SiteMapIndex(url); sitemapIndex.setType(SitemapType.INDEX); // Loop through the <sitemap>s for (int i = 0; i < nodeList.getLength() && i < MAX_URLS; i++) { Node firstNode = nodeList.item(i); URL sitemapUrl = null; Date lastModified = null; if (firstNode.getNodeType() == Node.ELEMENT_NODE) { Element elem = (Element) firstNode; String loc = getElementValue(elem, "loc"); // try the text content when no loc element // has been specified if (loc == null) { loc = elem.getTextContent().trim(); } try { sitemapUrl = new URL(loc); String lastmod = getElementValue(elem, "lastmod"); lastModified = SiteMap.convertToDate(lastmod); // Right now we are not worried about sitemapUrls that point // to different websites. SiteMap s = new SiteMap(sitemapUrl, lastModified); sitemapIndex.addSitemap(s); if (logger.isDebugEnabled()) { StringBuffer sb = new StringBuffer(" "); sb.append(i + 1).append(". ").append(s); logger.debug(sb.toString()); } } catch (MalformedURLException e) { // e.printStackTrace(); // Don't create an entry for a bad URL logger.debug("Bad url: [" + loc + "]"); } } } sitemapIndex.setProcessed(true); return sitemapIndex; } /** * Parse the XML document, looking for "feed" element to determine if it's an Atom doc and "rss" to determine if it's * an RSS doc. * * @param sitemapUrl * @param doc - XML document to parse * @throws UnknownFormatException if XML does not appear to be Arom or RSS */ private SiteMap parseSyndicationFormat(URL sitemapUrl, Document doc) throws UnknownFormatException { SiteMap sitemap = new SiteMap(sitemapUrl); // See if this is an Atom feed by looking for "feed" element NodeList list = doc.getElementsByTagName("feed"); if (list.getLength() > 0) { parseAtom(sitemap, (Element) list.item(0), doc); sitemap.setProcessed(true); return sitemap; } else { // See if RSS feed by looking for "rss" element list = doc.getElementsByTagName("rss"); if (list.getLength() > 0) { parseRSS(sitemap, doc); sitemap.setProcessed(true); return sitemap; } else { throw new UnknownFormatException("Unknown syndication format at " + sitemapUrl); } } } /** * Parse the XML document which is assumed to be in Atom format. Atom 1.0 example: * * <?xml version="1.0" encoding="utf-8"?> <feed xmlns="http://www.w3.org/2005/Atom"> * * <title>Example Feed</title> <subtitle>A subtitle.</subtitle> <link href="http://example.org/feed/" rel="self"/> * <link href="http://example.org/"/> <modified>2003-12-13T18:30:02Z</modified> <author> <name>John Doe</name> * <email>johndoe@example.com</email> </author> <id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id> * * <entry> <title>Atom-Powered Robots Run Amok</title> <link href="http://example.org/2003/12/13/atom03"/> * <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> <updated>2003-12-13T18:30:02Z</updated> <summary>Some * text.</summary> </entry> * * </feed> * * @param elem * @param doc */ private void parseAtom(SiteMap sitemap, Element elem, Document doc) { // Grab items from <feed><entry><link href="URL" /></entry></feed> // Use lastmod date from <feed><modified>DATE</modified></feed> logger.debug("Parsing Atom XML"); sitemap.setType(SitemapType.ATOM); String lastMod = getElementValue(elem, "modified"); logger.debug("lastMod=" + lastMod); NodeList list = doc.getElementsByTagName("entry"); // Loop through the <entry>s for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) { Node n = list.item(i); if (n.getNodeType() == Node.ELEMENT_NODE) { elem = (Element) n; String href = getElementAttributeValue(elem, "link", "href"); logger.debug("href=" + href); URL url = null; try { url = new URL(href); boolean valid = urlIsLegal(sitemap.getBaseUrl(), url.toString()); if (valid || !strict) { SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, null, null, valid); sitemap.addSiteMapUrl(sUrl); if (logger.isDebugEnabled()) { StringBuffer sb = new StringBuffer(" "); sb.append(i + 1).append(". ").append(sUrl); logger.debug(sb.toString()); } } } catch (MalformedURLException e) { // Can't create an entry with a bad URL logger.debug("Bad url: [" + href + "]"); } } } } /** * Parse XML document which is assumed to be in RSS format. RSS 2.0 example: * * <?xml version="1.0"?> <rss version="2.0"> <channel> <title>Lift Off News</title> * <link>http://liftoff.msfc.nasa.gov/</link> <description>Liftoff to Space Exploration.</description> * <language>en-us</language> <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate> <lastBuildDate>Tue, 10 Jun 2003 * 09:41:01 GMT</lastBuildDate> <docs>http://blogs.law.harvard.edu/tech/rss</docs> <generator>Weblog Editor * 2.0</generator> <managingEditor>editor@example.com</managingEditor> <webMaster>webmaster@example.com</webMaster> * <ttl>5</ttl> * * <item> <title>Star City</title> <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link> * <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a * crash course in culture, language and protocol at Russia's Star City.</description> <pubDate>Tue, 03 Jun 2003 * 09:39:21 GMT</pubDate> <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid> </item> * * <item> <title>Space Exploration</title> <link>http://liftoff.msfc.nasa.gov/</link> <description>Sky watchers in * Europe, Asia, and parts of Alaska and Canada will experience a partial eclipse of the Sun on Saturday, May * 31.</description> <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate> * <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid> </item> * * </channel> </rss> * * @param sitemap * @param doc */ private void parseRSS(SiteMap sitemap, Document doc) { // Grab items from <item><link>URL</link></item> // and last modified date from <pubDate>DATE</pubDate> logger.debug("Parsing RSS doc"); sitemap.setType(SitemapType.RSS); NodeList list = doc.getElementsByTagName("channel"); Element elem = (Element) list.item(0); // Treat publication date as last mod (Tue, 10 Jun 2003 04:00:00 GMT) String lastMod = getElementValue(elem, "pubDate"); logger.debug("lastMod=" + lastMod); list = doc.getElementsByTagName("item"); // Loop through the <item>s for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) { Node n = list.item(i); if (n.getNodeType() == Node.ELEMENT_NODE) { elem = (Element) n; String link = getElementValue(elem, "link"); logger.debug("link=" + link); try { URL url = new URL(link); boolean valid = urlIsLegal(sitemap.getBaseUrl(), url.toString()); if (valid || !strict) { SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, null, null, valid); sitemap.addSiteMapUrl(sUrl); if (logger.isDebugEnabled()) { StringBuffer sb = new StringBuffer(" "); sb.append(i + 1).append(". ").append(sUrl); logger.debug(sb.toString()); } } } catch (MalformedURLException e) { // Can't create an entry with a bad URL logger.debug("Bad url: [" + link + "]"); } } } } /** * Get the element's textual content. * * @param elem * @param elementName * @return */ private String getElementValue(Element elem, String elementName) { NodeList list = elem.getElementsByTagName(elementName); Element e = (Element) list.item(0); if (e != null) { NodeList children = e.getChildNodes(); if (children.item(0) != null) { return ((Node) children.item(0)).getNodeValue().trim(); } } return null; } /** * Get the element's attribute value. * * @param elem * @param elementName * @param attributeName * @return */ private String getElementAttributeValue(Element elem, String elementName, String attributeName) { NodeList list = elem.getElementsByTagName(elementName); Element e = (Element) list.item(0); if (e != null) { return e.getAttribute(attributeName); } return null; } /** * See if testUrl is under sitemapUrl. Only URLs under sitemapUrl are legal. Both URLs are first converted to * lowercase before the comparison is made (this could be an issue on web servers that are case sensitive). * * @param sitemapUrl * @param testUrl * @return true if testUrl is under sitemapUrl, false otherwise */ protected static boolean urlIsLegal(String sitemapBaseUrl, String testUrl) { boolean ret = false; // Don't try a comparison if the URL is too short to match if (sitemapBaseUrl != null && sitemapBaseUrl.length() <= testUrl.length()) { String u = testUrl.substring(0, sitemapBaseUrl.length()).toLowerCase(); ret = sitemapBaseUrl.toLowerCase().equals(u); } if (logger.isTraceEnabled()) { StringBuffer sb = new StringBuffer("urlIsLegal: "); sb.append(sitemapBaseUrl).append(" <= ").append(testUrl); sb.append(" ? ").append(ret); logger.trace(sb.toString()); } return ret; } }