// Java tutorial
/* Copyright 2010-2013 Norconex Inc. * * This file is part of Norconex HTTP Collector. * * Norconex HTTP Collector is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Norconex HTTP Collector is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Norconex HTTP Collector. If not, * see <http://www.gnu.org/licenses/>. */ package com.norconex.collector.http.sitemap.impl; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.Writer; import java.util.Arrays; import java.util.HashSet; import java.util.Set; import java.util.zip.GZIPInputStream; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import javax.xml.stream.XMLStreamWriter; import org.apache.commons.configuration.XMLConfiguration; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.builder.EqualsBuilder; import org.apache.commons.lang.builder.HashCodeBuilder; import org.apache.commons.lang.builder.ToStringBuilder; import org.apache.commons.lang.builder.ToStringStyle; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.joda.time.DateTime; import 
com.norconex.collector.http.crawler.CrawlURL; import com.norconex.collector.http.sitemap.ISitemapsResolver; import com.norconex.collector.http.sitemap.SitemapURLStore; import com.norconex.commons.lang.config.ConfigurationLoader; import com.norconex.commons.lang.config.IXMLConfigurable; /** * <p> * Default implementation of {@link ISitemapsResolver}. For any given URL * this class will look in three different places to locate sitemaps: * </p> * <ul> * <li>Sitemap locations explicitly provided via configuration (or setter * method on this class).</li> * <li>The root-level of a URL (e.g. http://example.com/sitemap.xml)</li> * <li>Any sitemaps defined in robots.txt * (automatically passed as arguments to this class if robots.txt are * not ignored)</li> * </ul> * <p> * The Sitemap specifications dictates that a sitemap.xml file defined * in a sub-directory applies only to URLs found in that sub-directory and * its children. This behavior is respected by default. Setting lenient * to <code>true</code> no longer honors this restriction. * </p> * <p> * XML configuration usage (not required since default): * </p> * <pre> * <sitemap ignore="false" lenient="false" * class="com.norconex.collector.http.sitemap.impl.DefaultSitemapResolver"> * <location>(optional location of sitemap.xml)</location> * (... repeat location tag as needed ...) 
* </sitemap> * </pre> * @author Pascal Essiembre */ public class DefaultSitemapResolver implements ISitemapsResolver, IXMLConfigurable { private static final long serialVersionUID = 4047819847150159618L; private static final Logger LOG = LogManager.getLogger(DefaultSitemapResolver.class); private String[] sitemapLocations; private boolean lenient; @Override public void resolveSitemaps(DefaultHttpClient httpClient, String urlRoot, String[] robotsTxtLocations, SitemapURLStore sitemapURLStore) { final Set<String> resolvedLocations = new HashSet<String>(); Set<String> uniqueLocations = combineLocations(robotsTxtLocations, urlRoot); for (String location : uniqueLocations) { resolveLocation(location, httpClient, sitemapURLStore, resolvedLocations); } } public String[] getSitemapLocations() { return sitemapLocations; } public void setSitemapLocations(String... sitemapLocations) { this.sitemapLocations = sitemapLocations; } public boolean isLenient() { return lenient; } public void setLenient(boolean lenient) { this.lenient = lenient; } @Override public void loadFromXML(Reader in) throws IOException { XMLConfiguration xml = ConfigurationLoader.loadXML(in); setLenient(xml.getBoolean("[@lenient]", false)); setSitemapLocations(xml.getList("location").toArray(ArrayUtils.EMPTY_STRING_ARRAY)); } @Override public void saveToXML(Writer out) throws IOException { XMLOutputFactory factory = XMLOutputFactory.newInstance(); try { XMLStreamWriter writer = factory.createXMLStreamWriter(out); writer.writeStartElement("sitemap"); writer.writeAttribute("class", getClass().getCanonicalName()); writer.writeAttribute("lenient", Boolean.toString(lenient)); if (sitemapLocations != null) { for (String location : sitemapLocations) { writer.writeStartElement("location"); writer.writeCharacters(location); writer.writeEndElement(); } } writer.writeEndElement(); writer.flush(); writer.close(); } catch (XMLStreamException e) { throw new IOException("Cannot save as XML.", e); } } private void 
resolveLocation(String location, DefaultHttpClient httpClient, SitemapURLStore sitemapURLStore, Set<String> resolvedLocations) { if (resolvedLocations.contains(location)) { return; } HttpGet method = null; try { method = new HttpGet(location); // Execute the method. HttpResponse response = httpClient.execute(method); int statusCode = response.getStatusLine().getStatusCode(); if (statusCode == HttpStatus.SC_OK) { LOG.info("Resolving sitemap: " + location); InputStream is = response.getEntity().getContent(); if ("application/x-gzip".equals(response.getFirstHeader("Content-Type").getValue())) { is = new GZIPInputStream(is); } parseLocation(is, httpClient, sitemapURLStore, resolvedLocations, location); IOUtils.closeQuietly(is); LOG.info(" Resolved: " + location); } else if (statusCode == HttpStatus.SC_NOT_FOUND) { LOG.debug("No sitemap found : " + location); } else { LOG.error("Could not obtain sitemap: " + location + ". Expected status code " + HttpStatus.SC_OK + ", but got " + statusCode); } } catch (Exception e) { LOG.error("Cannot fetch sitemap: " + location + " (" + e.getMessage() + ")"); } finally { resolvedLocations.add(location); if (method != null) { method.releaseConnection(); } } } private void parseLocation(InputStream is, DefaultHttpClient httpClient, SitemapURLStore sitemapURLStore, Set<String> resolvedLocations, String location) throws XMLStreamException { XMLInputFactory inputFactory = XMLInputFactory.newInstance(); inputFactory.setProperty(XMLInputFactory.IS_COALESCING, true); XMLStreamReader xmlReader = inputFactory.createXMLStreamReader(is); ParseState parseState = new ParseState(); String locationDir = StringUtils.substringBeforeLast(location, "/"); int event = xmlReader.getEventType(); while (true) { switch (event) { case XMLStreamConstants.START_ELEMENT: String tag = xmlReader.getLocalName(); parseStartElement(parseState, tag); break; case XMLStreamConstants.CHARACTERS: String value = xmlReader.getText(); if (parseState.sitemapIndex && 
parseState.loc) { resolveLocation(value, httpClient, sitemapURLStore, resolvedLocations); parseState.loc = false; } else if (parseState.baseURL != null) { parseCharacters(parseState, value); } break; case XMLStreamConstants.END_ELEMENT: tag = xmlReader.getLocalName(); parseEndElement(sitemapURLStore, parseState, locationDir, tag); break; } if (!xmlReader.hasNext()) { break; } event = xmlReader.next(); } } private void parseEndElement(SitemapURLStore sitemapURLStore, ParseState parseState, String locationDir, String tag) { if ("sitemap".equalsIgnoreCase(tag)) { parseState.sitemapIndex = false; } else if ("url".equalsIgnoreCase(tag) && parseState.baseURL.getUrl() != null) { if (isRelaxed(parseState, locationDir)) { sitemapURLStore.add(parseState.baseURL); } else if (LOG.isDebugEnabled()) { LOG.debug("Sitemap URL invalid for location directory." + " URL:" + parseState.baseURL.getUrl() + " Location directory: " + locationDir); } parseState.baseURL = null; } } private boolean isRelaxed(ParseState parseState, String locationDir) { return lenient || parseState.baseURL.getUrl().startsWith(locationDir); } private void parseCharacters(ParseState parseState, String value) { if (parseState.loc) { parseState.baseURL.setUrl(value); parseState.loc = false; } else if (parseState.lastmod) { try { parseState.baseURL.setSitemapLastMod(DateTime.parse(value).getMillis()); } catch (Exception e) { LOG.info("Invalid sitemap date: " + value); } parseState.lastmod = false; } else if (parseState.changefreq) { parseState.baseURL.setSitemapChangeFreq(value); parseState.changefreq = false; } else if (parseState.priority) { try { parseState.baseURL.setSitemapPriority(Float.parseFloat(value)); } catch (NumberFormatException e) { LOG.info("Invalid sitemap priority: " + value); } parseState.priority = false; } } private void parseStartElement(ParseState parseState, String tag) { if ("sitemap".equalsIgnoreCase(tag)) { parseState.sitemapIndex = true; } else if ("url".equalsIgnoreCase(tag)) { 
parseState.baseURL = new CrawlURL("", 0); } else if ("loc".equalsIgnoreCase(tag)) { parseState.loc = true; } else if ("lastmod".equalsIgnoreCase(tag)) { parseState.lastmod = true; } else if ("changefreq".equalsIgnoreCase(tag)) { parseState.changefreq = true; } else if ("priority".equalsIgnoreCase(tag)) { parseState.priority = true; } } private Set<String> combineLocations(String[] robotsTxtLocations, String urlRoot) { Set<String> uniqueLocations = new HashSet<String>(); uniqueLocations.add(urlRoot + "/sitemap_index.xml"); uniqueLocations.add(urlRoot + "/sitemap.xml"); if (ArrayUtils.isNotEmpty(robotsTxtLocations)) { uniqueLocations.addAll(Arrays.asList(robotsTxtLocations)); } if (ArrayUtils.isNotEmpty(sitemapLocations)) { uniqueLocations.addAll(Arrays.asList(sitemapLocations)); } return uniqueLocations; } @Override public boolean equals(final Object other) { if (!(other instanceof DefaultSitemapResolver)) { return false; } DefaultSitemapResolver castOther = (DefaultSitemapResolver) other; return new EqualsBuilder().append(sitemapLocations, castOther.sitemapLocations) .append(lenient, castOther.lenient).isEquals(); } @Override public int hashCode() { return new HashCodeBuilder().append(sitemapLocations).append(lenient).toHashCode(); } @Override public String toString() { return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) .append("sitemapLocations", sitemapLocations).append("lenient", lenient).toString(); } private class ParseState { private CrawlURL baseURL = null; private boolean sitemapIndex = false; private boolean loc = false; private boolean lastmod = false; private boolean changefreq = false; private boolean priority = false; } }