org.eclipse.smila.connectivity.framework.crawler.web.http.SitemapParser.java Source code

Java tutorial

Introduction

Here is the source code for org.eclipse.smila.connectivity.framework.crawler.web.http.SitemapParser.java

Source

/***********************************************************************************************************************
 * Copyright (c) 2008 empolis GmbH and brox IT Solutions GmbH. All rights reserved. This program and the accompanying
 * materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this distribution,
 * and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Dmitry Hazin (brox IT Solutions GmbH) - initial creator
 * Sebastian Voigt (brox IT Solutions GmbH)
 **********************************************************************************************************************/
package org.eclipse.smila.connectivity.framework.crawler.web.http;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configurable;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configuration;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Outlink;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * This class handles the parsing of <code>sitemap.xml</code> files.
 */
public class SitemapParser implements Configurable {

    /** The Log. */
    private static final Log LOG = LogFactory.getLog(SitemapParser.class);

    /** The Constant CACHE. */
    private static final Hashtable<String, Outlink[]> CACHE = new Hashtable<String, Outlink[]>();

    /** The Constant EMPTY_LINKS. */
    private static final Outlink[] EMPTY_LINKS = new Outlink[0];

    /** The Constant SITEMAP_FILENAME. */
    private static final String[] SITEMAP_FILENAME = { "sitemap.xml", "sitemap.xml.gz", "sitemap.gz" };

    /** The _conf. */
    private Configuration _conf;

    /**
     * Creates new SitemapParser with the given configuration.
     * 
     * @param conf
     *          Configuration
     */
    public SitemapParser(Configuration conf) {
        setConf(conf);
    }

    /**
     * {@inheritDoc}
     */
    public void setConf(Configuration conf) {
        _conf = conf;
    }

    /**
     * {@inheritDoc}
     */
    public Configuration getConf() {
        return _conf;
    }

    /**
     * Returns a {@link Outlink} array with links extracted from site map file.
     * 
     * @param sitemapContent
     *          the site map content
     * 
     * @return the outlink[]
     */
    Outlink[] parseSitemapLinks(byte[] sitemapContent) {
        if (sitemapContent == null) {
            return EMPTY_LINKS;
        }
        final List<Outlink> sitemapLinks = new ArrayList<Outlink>();
        try {
            final DocumentBuilder documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            final InputStream content = new ByteArrayInputStream(sitemapContent);
            final Document document = documentBuilder.parse(content);
            final Element element = document.getDocumentElement();
            final NodeList childNodeList = element.getChildNodes();
            for (int i = 0; i < childNodeList.getLength(); i++) {
                final Node sitemapNode = childNodeList.item(i);
                if (sitemapNode instanceof Element) {
                    final Element sitemapElements = (Element) sitemapNode;
                    final String sitemapElementsName = sitemapElements.getNodeName();
                    if ("url".equals(sitemapElementsName)) {
                        final NodeList urlItemsNodeList = sitemapElements.getChildNodes();
                        for (int j = 0; j < urlItemsNodeList.getLength(); j++) {
                            final Node urlItemNode = urlItemsNodeList.item(j);
                            if (urlItemNode instanceof Element) {
                                final Element urlItemElement = (Element) urlItemNode;
                                final String urlItemElementName = urlItemElement.getNodeName();
                                if ("loc".equals(urlItemElementName)) {
                                    sitemapLinks.add(new Outlink(urlItemElement.getTextContent(),
                                            urlItemElement.getTextContent(), _conf));
                                }
                            }
                        }
                    }
                }
            }
        } catch (MalformedURLException exception) {
            LOG.error("Error creationg outlink while parsing sitemap");
        } catch (Exception exception) {
            LOG.error("Error parsing sitemap");
            return EMPTY_LINKS;
        }

        final Outlink[] linksArray = new Outlink[sitemapLinks.size()];
        sitemapLinks.toArray(linksArray);
        return linksArray;
    }

    /**
     * Returns a set of {@link Outlink}s extracted from sitemap.xml loc tag.
     * 
     * @param http
     *          HttpBase object used to fetch sitemap.xml file.
     * @param url
     *          URL to fetch site map for.
     * 
     * @return Outlink[] Array of extracted outlinks. If no outlinks were found returns an empty array.
     */
    public Outlink[] getSitemapLinks(HttpBase http, URL url) {
        return getSitemapLinks(http, url, 0);
    }

    /**
     * Gets the site map links.
     * 
     * @param http
     *          the HTTP
     * @param url
     *          the URL
     * @param sitemapFilename
     *          the site map filename
     * 
     * @return the site map links
     */
    private Outlink[] getSitemapLinks(HttpBase http, URL url, int sitemapFilename) {

        final String host = url.getHost().toLowerCase(); // normalize to lower case

        Outlink[] sitemapLinks = CACHE.get(host);

        boolean cacheSitemap = true;

        if (sitemapLinks == null) { // cache miss
            if (LOG.isTraceEnabled()) {
                LOG.trace("cache miss " + url);
            }
            try {
                final URL sitemapUrl = new URL(url, "/" + SITEMAP_FILENAME[sitemapFilename]);
                final Response response = http.getResponse(sitemapUrl.toString());
                byte[] content;
                if (response.getCode() == HttpResponseCode.CODE_200) { // found site map: parse it
                    content = response.getContent();
                    sitemapLinks = parseSitemapLinks(content);
                } else if ((response.getCode() == HttpResponseCode.CODE_403)) {
                    sitemapLinks = EMPTY_LINKS;
                    // if sitemap.xml wasn't found try to fetch GZIP site map file (sitemap.xml.gz or sitemap.gz)
                } else if ((response.getCode() == HttpResponseCode.CODE_404)
                        && (sitemapFilename < SITEMAP_FILENAME.length - 1)) {
                    return getSitemapLinks(http, url, sitemapFilename + 1);
                } else if (response.getCode() >= HttpResponseCode.CODE_500) {
                    cacheSitemap = false;
                    sitemapLinks = EMPTY_LINKS;
                } else {
                    sitemapLinks = EMPTY_LINKS;
                }
            } catch (Exception exception) {
                LOG.error("Couldn't get sitemap.xml for " + url + ": " + exception.toString());
                cacheSitemap = false;
                sitemapLinks = EMPTY_LINKS;
            }

            if (cacheSitemap) {
                CACHE.put(host, sitemapLinks); // cache site map for host
            }
        } else {
            // if site map was cached don't return site map links another time
            sitemapLinks = EMPTY_LINKS;
        }
        return sitemapLinks;
    }

}