org.jboss.elasticsearch.river.remote.sitemap.SiteMapParser.java Source code

Introduction

Here is the source code for org.jboss.elasticsearch.river.remote.sitemap.SiteMapParser.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.jboss.elasticsearch.river.remote.sitemap;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.zip.GZIPInputStream;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.io.input.BOMInputStream;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.jboss.elasticsearch.river.remote.sitemap.AbstractSiteMap.SitemapType;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

/**
 * @author http://code.google.com/p/crawler-commons
 */
public class SiteMapParser {

    private static final ESLogger logger = Loggers.getLogger(SiteMapParser.class);

    /** According to the specs, 50K URLs per Sitemap is the max */
    private static final int MAX_URLS = 50000;

    /** Sitemap docs must be limited to 10MB (10,485,760 bytes) */
    public static int MAX_BYTES_ALLOWED = 10485760;

    /** True (by default) if invalid URLs should be rejected */
    private boolean strict;

    public SiteMapParser() {
        this(true);
    }

    public SiteMapParser(boolean strict) {
        this.strict = strict;
    }

    /**
     * @return whether invalid URLs will be rejected
     */
    public boolean isStrict() {
        return strict;
    }

    /**
     * Returns a SiteMap or SiteMapIndex given a content type, byte content and the URL of a sitemap
     */
    public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url)
            throws UnknownFormatException, IOException {

        // Use extension or MIME type to determine how we should try
        // to process the response
        if (url.getPath().endsWith(".xml") || contentType.contains("text/xml")
                || contentType.contains("application/xml") || contentType.contains("application/x-xml")
                || contentType.contains("application/atom+xml") || contentType.contains("application/rss+xml")) {

            // Try parsing the XML which could be in a number of formats
            return processXml(url, content);
        } else if (url.getPath().endsWith(".txt") || contentType.contains("text/plain")) {
            // plain text
            return (AbstractSiteMap) processText(content, url.toString());
        } else if (url.getPath().endsWith(".gz") || contentType.contains("application/gzip")
                || contentType.contains("application/x-gzip") || contentType.contains("application/x-gunzip")
                || contentType.contains("application/gzipped")
                || contentType.contains("application/gzip-compressed")
                || contentType.contains("application/x-compress") || contentType.contains("gzip/document")
                || contentType.contains("application/octet-stream")) {
            return processGzip(url, content);
        }
        throw new UnknownFormatException("Unknown format " + contentType + " at " + url);
    }

    /**
     * Parse the given XML content.
     * 
     * @param sitemapUrl
     * @param xmlContent
     * @return
     * @throws UnknownFormatException
     */
    private AbstractSiteMap processXml(URL sitemapUrl, byte[] xmlContent) throws UnknownFormatException {

        BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(xmlContent));
        InputSource is = new InputSource();
        is.setCharacterStream(new BufferedReader(new InputStreamReader(bomIs)));
        return processXml(sitemapUrl, is);
    }

    /**
     * Process a text-based Sitemap. Text sitemaps only list URLs but no priorities, last mods, etc.
     * 
     * @param content
     * @throws IOException
     */
    private SiteMap processText(byte[] content, String sitemapUrl) throws IOException {

        logger.debug("Processing textual Sitemap");

        SiteMap textSiteMap = new SiteMap(sitemapUrl);
        textSiteMap.setType(SitemapType.TEXT);

        BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(content));
        @SuppressWarnings("resource")
        BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs));

        String line;

        int i = 1;
        while ((line = reader.readLine()) != null) {
            if (line.length() > 0 && i <= MAX_URLS) {
                try {
                    URL url = new URL(line);
                    boolean valid = urlIsLegal(textSiteMap.getBaseUrl(), url.toString());

                    if (valid || !strict) {
                        if (logger.isDebugEnabled()) {
                            StringBuffer sb = new StringBuffer("  ");
                            sb.append(i).append(". ").append(url);
                            logger.debug(sb.toString());
                        }
                        i++;
                        SiteMapURL surl = new SiteMapURL(url, valid);
                        textSiteMap.addSiteMapUrl(surl);
                    }
                } catch (MalformedURLException e) {
                    logger.debug("Bad URL [" + line + "].");
                }
            }
        }
        textSiteMap.setProcessed(true);
        return textSiteMap;
    }

    /**
     * Decompress the gzipped content and process the resulting XML Sitemap.
     * 
     * @param url - URL of the gzipped content
     * @param response - Gzipped content
     * @throws MalformedURLException
     * @throws IOException
     * @throws UnknownFormatException
     */
    private AbstractSiteMap processGzip(URL url, byte[] response)
            throws MalformedURLException, IOException, UnknownFormatException {

        logger.debug("Processing gzip");

        AbstractSiteMap smi;

        InputStream is = new ByteArrayInputStream(response);

        // Remove .gz ending
        String xmlUrl = url.toString().replaceFirst("\\.gz$", "");

        logger.debug("XML url = " + xmlUrl);

        BOMInputStream decompressed = new BOMInputStream(new GZIPInputStream(is));
        InputSource in = new InputSource(decompressed);
        in.setSystemId(xmlUrl);
        smi = processXml(url, in);
        decompressed.close();
        return smi;
    }

    /**
     * Parse the given XML content.
     * 
     * @param sitemapUrl
     * @param is
     * @throws UnknownFormatException
     */
    private AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {

        Document doc = null;

        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder db = dbf.newDocumentBuilder();
            db.setErrorHandler(new ErrorHandler() {

                @Override
                public void warning(SAXParseException exception) throws SAXException {
                    logger.warn("Sitemap XML warning: " + exception.getMessage());
                }

                @Override
                public void fatalError(SAXParseException exception) throws SAXException {
                    logger.warn("Sitemap XML fatalError: " + exception.getMessage());
                }

                @Override
                public void error(SAXParseException exception) throws SAXException {
                    logger.warn("Sitemap XML error: " + exception.getMessage());

                }
            });
            doc = db.parse(is);
        } catch (Exception e) {
            throw new UnknownFormatException("Error parsing XML for " + sitemapUrl);
        }

        // See if this is a sitemap index
        NodeList nodeList = doc.getElementsByTagName("sitemapindex");
        if (nodeList.getLength() > 0) {
            nodeList = doc.getElementsByTagName("sitemap");
            return parseSitemapIndex(sitemapUrl, nodeList);
        } else if (doc.getElementsByTagName("urlset").getLength() > 0) {
            // This is a regular Sitemap
            return parseXmlSitemap(sitemapUrl, doc);
        } else if (doc.getElementsByTagName("link").getLength() > 0) {
            // Could be RSS or Atom
            return parseSyndicationFormat(sitemapUrl, doc);
        }
        throw new UnknownFormatException("Unknown XML format for " + sitemapUrl);
    }

    /**
     * Parse XML that contains a valid Sitemap. Example of a Sitemap: <?xml version="1.0" encoding="UTF-8"?> <urlset
     * xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url> <loc>http://www.example.com/</loc>
     * <lastmod>2005-01-01</lastmod> <changefreq>monthly</changefreq> <priority>0.8</priority> </url> <url> <loc
     * >http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii</loc> <changefreq>weekly</changefreq> </url>
     * </urlset>
     * 
     * @param doc
     */
    private SiteMap parseXmlSitemap(URL sitemapUrl, Document doc) {

        SiteMap sitemap = new SiteMap(sitemapUrl);
        sitemap.setType(SitemapType.XML);

        NodeList list = doc.getElementsByTagName("url");

        // Loop through the <url>s
        for (int i = 0; i < list.getLength(); i++) {

            Node n = list.item(i);

            if (n.getNodeType() == Node.ELEMENT_NODE) {
                Element elem = (Element) n;

                String loc = getElementValue(elem, "loc");

                URL url = null;
                try {
                    url = new URL(loc);
                    String lastMod = getElementValue(elem, "lastmod");
                    String changeFreq = getElementValue(elem, "changefreq");
                    String priority = getElementValue(elem, "priority");
                    boolean valid = urlIsLegal(sitemap.getBaseUrl(), url.toString());

                    if (valid || !strict) {
                        SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, changeFreq, priority, valid);
                        sitemap.addSiteMapUrl(sUrl);
                        if (logger.isDebugEnabled()) {
                            StringBuffer sb = new StringBuffer("  ");
                            sb.append(i + 1).append(". ").append(sUrl);
                            logger.debug(sb.toString());
                        }
                    }
                } catch (MalformedURLException e) {
                    // e.printStackTrace();

                    // Can't create an entry with a bad URL
                    logger.debug("Bad url: [" + loc + "]");
                }
            }
        }
        sitemap.setProcessed(true);
        return sitemap;
    }

    /**
     * Parse XML that contains a Sitemap Index. Example Sitemap Index:
     * 
     * <?xml version="1.0" encoding="UTF-8"?> <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <sitemap>
     * <loc>http://www.example.com/sitemap1.xml.gz</loc> <lastmod>2004-10-01T18:23:17+00:00</lastmod> </sitemap> <sitemap>
     * <loc>http://www.example.com/sitemap2.xml.gz</loc> <lastmod>2005-01-01</lastmod> </sitemap> </sitemapindex>
     * 
     * @param url - URL of Sitemap Index
     * @param nodeList
     */
    private SiteMapIndex parseSitemapIndex(URL url, NodeList nodeList) {

        logger.debug("Parsing Sitemap Index");

        SiteMapIndex sitemapIndex = new SiteMapIndex(url);
        sitemapIndex.setType(SitemapType.INDEX);

        // Loop through the <sitemap>s
        for (int i = 0; i < nodeList.getLength() && i < MAX_URLS; i++) {

            Node firstNode = nodeList.item(i);

            URL sitemapUrl = null;
            Date lastModified = null;

            if (firstNode.getNodeType() == Node.ELEMENT_NODE) {
                Element elem = (Element) firstNode;
                String loc = getElementValue(elem, "loc");

                // try the text content when no loc element
                // has been specified
                if (loc == null) {
                    loc = elem.getTextContent().trim();
                }

                try {
                    sitemapUrl = new URL(loc);
                    String lastmod = getElementValue(elem, "lastmod");
                    lastModified = SiteMap.convertToDate(lastmod);

                    // Right now we are not worried about sitemapUrls that point
                    // to different websites.

                    SiteMap s = new SiteMap(sitemapUrl, lastModified);
                    sitemapIndex.addSitemap(s);
                    if (logger.isDebugEnabled()) {
                        StringBuffer sb = new StringBuffer("  ");
                        sb.append(i + 1).append(". ").append(s);
                        logger.debug(sb.toString());
                    }
                } catch (MalformedURLException e) {
                    // e.printStackTrace();

                    // Don't create an entry for a bad URL
                    logger.debug("Bad url: [" + loc + "]");
                }
            }
        }
        sitemapIndex.setProcessed(true);
        return sitemapIndex;
    }

    /**
     * Parse the XML document, looking for "feed" element to determine if it's an Atom doc and "rss" to determine if it's
     * an RSS doc.
     * 
     * @param sitemapUrl
     * @param doc - XML document to parse
     * @throws UnknownFormatException if XML does not appear to be Arom or RSS
     */
    private SiteMap parseSyndicationFormat(URL sitemapUrl, Document doc) throws UnknownFormatException {

        SiteMap sitemap = new SiteMap(sitemapUrl);

        // See if this is an Atom feed by looking for "feed" element
        NodeList list = doc.getElementsByTagName("feed");
        if (list.getLength() > 0) {
            parseAtom(sitemap, (Element) list.item(0), doc);
            sitemap.setProcessed(true);
            return sitemap;
        } else {
            // See if RSS feed by looking for "rss" element
            list = doc.getElementsByTagName("rss");
            if (list.getLength() > 0) {
                parseRSS(sitemap, doc);
                sitemap.setProcessed(true);
                return sitemap;
            } else {
                throw new UnknownFormatException("Unknown syndication format at " + sitemapUrl);
            }
        }
    }

    /**
     * Parse the XML document which is assumed to be in Atom format. Atom 1.0 example:
     * 
     * <?xml version="1.0" encoding="utf-8"?> <feed xmlns="http://www.w3.org/2005/Atom">
     * 
     * <title>Example Feed</title> <subtitle>A subtitle.</subtitle> <link href="http://example.org/feed/" rel="self"/>
     * <link href="http://example.org/"/> <modified>2003-12-13T18:30:02Z</modified> <author> <name>John Doe</name>
     * <email>johndoe@example.com</email> </author> <id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>
     * 
     * <entry> <title>Atom-Powered Robots Run Amok</title> <link href="http://example.org/2003/12/13/atom03"/>
     * <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> <updated>2003-12-13T18:30:02Z</updated> <summary>Some
     * text.</summary> </entry>
     * 
     * </feed>
     * 
     * @param elem
     * @param doc
     */
    private void parseAtom(SiteMap sitemap, Element elem, Document doc) {

        // Grab items from <feed><entry><link href="URL" /></entry></feed>
        // Use lastmod date from <feed><modified>DATE</modified></feed>

        logger.debug("Parsing Atom XML");

        sitemap.setType(SitemapType.ATOM);

        String lastMod = getElementValue(elem, "modified");
        logger.debug("lastMod=" + lastMod);

        NodeList list = doc.getElementsByTagName("entry");

        // Loop through the <entry>s
        for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {

            Node n = list.item(i);

            if (n.getNodeType() == Node.ELEMENT_NODE) {
                elem = (Element) n;

                String href = getElementAttributeValue(elem, "link", "href");
                logger.debug("href=" + href);

                URL url = null;
                try {
                    url = new URL(href);
                    boolean valid = urlIsLegal(sitemap.getBaseUrl(), url.toString());

                    if (valid || !strict) {
                        SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, null, null, valid);
                        sitemap.addSiteMapUrl(sUrl);
                        if (logger.isDebugEnabled()) {
                            StringBuffer sb = new StringBuffer("  ");
                            sb.append(i + 1).append(". ").append(sUrl);
                            logger.debug(sb.toString());
                        }
                    }
                } catch (MalformedURLException e) {
                    // Can't create an entry with a bad URL
                    logger.debug("Bad url: [" + href + "]");
                }

            }
        }
    }

    /**
     * Parse XML document which is assumed to be in RSS format. RSS 2.0 example:
     * 
     * <?xml version="1.0"?> <rss version="2.0"> <channel> <title>Lift Off News</title>
     * <link>http://liftoff.msfc.nasa.gov/</link> <description>Liftoff to Space Exploration.</description>
     * <language>en-us</language> <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate> <lastBuildDate>Tue, 10 Jun 2003
     * 09:41:01 GMT</lastBuildDate> <docs>http://blogs.law.harvard.edu/tech/rss</docs> <generator>Weblog Editor
     * 2.0</generator> <managingEditor>editor@example.com</managingEditor> <webMaster>webmaster@example.com</webMaster>
     * <ttl>5</ttl>
     * 
     * <item> <title>Star City</title> <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
     * <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a
     * crash course in culture, language and protocol at Russia's Star City.</description> <pubDate>Tue, 03 Jun 2003
     * 09:39:21 GMT</pubDate> <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid> </item>
     * 
     * <item> <title>Space Exploration</title> <link>http://liftoff.msfc.nasa.gov/</link> <description>Sky watchers in
     * Europe, Asia, and parts of Alaska and Canada will experience a partial eclipse of the Sun on Saturday, May
     * 31.</description> <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
     * <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid> </item>
     * 
     * </channel> </rss>
     * 
     * @param sitemap
     * @param doc
     */
    private void parseRSS(SiteMap sitemap, Document doc) {

        // Grab items from <item><link>URL</link></item>
        // and last modified date from <pubDate>DATE</pubDate>

        logger.debug("Parsing RSS doc");
        sitemap.setType(SitemapType.RSS);
        NodeList list = doc.getElementsByTagName("channel");
        Element elem = (Element) list.item(0);

        // Treat publication date as last mod (Tue, 10 Jun 2003 04:00:00 GMT)
        String lastMod = getElementValue(elem, "pubDate");

        logger.debug("lastMod=" + lastMod);

        list = doc.getElementsByTagName("item");

        // Loop through the <item>s
        for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {

            Node n = list.item(i);

            if (n.getNodeType() == Node.ELEMENT_NODE) {
                elem = (Element) n;

                String link = getElementValue(elem, "link");
                logger.debug("link=" + link);

                try {
                    URL url = new URL(link);
                    boolean valid = urlIsLegal(sitemap.getBaseUrl(), url.toString());

                    if (valid || !strict) {
                        SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, null, null, valid);
                        sitemap.addSiteMapUrl(sUrl);
                        if (logger.isDebugEnabled()) {
                            StringBuffer sb = new StringBuffer("  ");
                            sb.append(i + 1).append(". ").append(sUrl);
                            logger.debug(sb.toString());
                        }
                    }
                } catch (MalformedURLException e) {
                    // Can't create an entry with a bad URL
                    logger.debug("Bad url: [" + link + "]");
                }
            }
        }
    }

    /**
     * Get the element's textual content.
     * 
     * @param elem
     * @param elementName
     * @return
     */
    private String getElementValue(Element elem, String elementName) {

        NodeList list = elem.getElementsByTagName(elementName);
        Element e = (Element) list.item(0);
        if (e != null) {
            NodeList children = e.getChildNodes();
            if (children.item(0) != null) {
                return ((Node) children.item(0)).getNodeValue().trim();
            }
        }

        return null;
    }

    /**
     * Get the element's attribute value.
     * 
     * @param elem
     * @param elementName
     * @param attributeName
     * @return
     */
    private String getElementAttributeValue(Element elem, String elementName, String attributeName) {

        NodeList list = elem.getElementsByTagName(elementName);
        Element e = (Element) list.item(0);
        if (e != null) {
            return e.getAttribute(attributeName);
        }

        return null;
    }

    /**
     * See if testUrl is under sitemapUrl. Only URLs under sitemapUrl are legal. Both URLs are first converted to
     * lowercase before the comparison is made (this could be an issue on web servers that are case sensitive).
     * 
     * @param sitemapUrl
     * @param testUrl
     * @return true if testUrl is under sitemapUrl, false otherwise
     */
    protected static boolean urlIsLegal(String sitemapBaseUrl, String testUrl) {

        boolean ret = false;

        // Don't try a comparison if the URL is too short to match
        if (sitemapBaseUrl != null && sitemapBaseUrl.length() <= testUrl.length()) {
            String u = testUrl.substring(0, sitemapBaseUrl.length()).toLowerCase();
            ret = sitemapBaseUrl.toLowerCase().equals(u);
        }
        if (logger.isTraceEnabled()) {
            StringBuffer sb = new StringBuffer("urlIsLegal: ");
            sb.append(sitemapBaseUrl).append(" <= ").append(testUrl);
            sb.append(" ? ").append(ret);
            logger.trace(sb.toString());
        }

        return ret;
    }

}