it.pronetics.madstore.crawler.publisher.impl.AtomPublisherImpl.java Source code

Java tutorial

Introduction

Here is the source code for it.pronetics.madstore.crawler.publisher.impl.AtomPublisherImpl.java

Source

/**
 * Copyright 2008 - 2009 Pro-Netics S.P.A.
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */
package it.pronetics.madstore.crawler.publisher.impl;

import it.pronetics.madstore.common.AtomConstants;
import it.pronetics.madstore.common.dom.DomHelper;
import it.pronetics.madstore.crawler.publisher.AtomPublisher;
import it.pronetics.madstore.crawler.model.Page;
import it.pronetics.madstore.repository.CollectionRepository;
import it.pronetics.madstore.repository.EntryRepository;
import java.net.URL;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * {@link it.pronetics.madstore.crawler.publisher.AtomPublisher} implementation publishing an Atom feed as an AtomPub collection
 * into the {@link it.pronetics.madstore.repository.CollectionRepository}, and all related entries
 * into the {@link it.pronetics.madstore.repository.EntryRepository}.
 * <br><br>
 * Atom feeds are published only if not already present in the repository, while entries are updated if their
 * publishing date is newer than that of the already stored entry.
 * <br><br>
 * Atom feeds and entries should have a proper feed and entry key, in order to properly manage updating of entries: if no
 * key is provided, surrogate keys will be automatically generated based on page and URL heuristics, more specifically:
 * <ul>
 * <li>The host name of the crawled site will be used for generating the feed key.</li>
 * <li>A hash of the entry title will be used for generating each entry key.</li>
 * <li>All entries will be inserted under the same collection.</li>
 * </ul>
 * <br>
 * Atom entries should have a proper updated date, too: if no such date is found, the current one will be used.
 *
 * @author Salvatore Incandela
 * @author Sergio Bossa
 */
public class AtomPublisherImpl implements AtomPublisher {

    private static final Logger LOG = LoggerFactory.getLogger(AtomPublisherImpl.class);
    private EntryRepository entryRepository;
    private CollectionRepository collectionRepository;

    /**
     * Sets the repository used for storing and updating the entries of the published feed.
     *
     * @param entryRepository the entry repository to use.
     */
    public void setEntryRepository(EntryRepository entryRepository) {
        this.entryRepository = entryRepository;
    }

    /**
     * Sets the repository used for storing the collection built from the published feed.
     *
     * @param collectionRepository the collection repository to use.
     */
    public void setCollectionRepository(CollectionRepository collectionRepository) {
        this.collectionRepository = collectionRepository;
    }

    /**
     * Publishes the Atom feed contained in the given page: a collection element is built from the feed and
     * put into the collection repository if absent, then every Atom entry of the feed is put into the entry
     * repository if absent, or updated if newer.
     * <br><br>
     * Any exception raised while publishing is logged and not rethrown.
     *
     * @param page the crawled page whose data is the Atom feed to publish.
     */
    @Transactional(propagation = Propagation.REQUIRED, readOnly = false)
    public void publish(Page page) {
        try {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Publishing feed:\n{}.", page.getData());
            }
            Element feed = DomHelper.getDomFeedFromString(page.getData());
            String collectionKey = getOrGenerateCollectionKey(page, feed);
            String collectionTitle = getOrGenerateCollectionTitle(page, feed);
            // The collection key doubles as the collection href.
            String collectionHref = collectionKey;
            Element collectionElement = createCollectionElement(collectionKey, collectionHref, collectionTitle);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Publishing collection:\n{}.", DomHelper.getStringFromDomElement(collectionElement));
            }
            String newCollectionKey = collectionRepository.putIfAbsent(collectionElement);
            if (newCollectionKey != null) {
                LOG.info("Inserted collection with key {}.", collectionKey);
            } else {
                LOG.info("Collection {} already existent.", collectionKey);
            }
            NodeList entryNodes = feed.getElementsByTagNameNS(AtomConstants.ATOM_NS, AtomConstants.ATOM_ENTRY);
            if (entryNodes != null && entryNodes.getLength() > 0) {
                for (int i = 0; i < entryNodes.getLength(); i++) {
                    publishEntry(collectionKey, (Element) entryNodes.item(i));
                }
            } else {
                LOG.info("No entries for {}", page.getLink());
            }
        } catch (Exception e) {
            // Best-effort publishing: the failure is reported with its context and cause, but not rethrown.
            // NOTE(review): since the exception does not leave this transactional method, work done before
            // the failure is presumably not rolled back - confirm this is intended.
            LOG.warn("Publishing abnormally terminated: {} ({})", page.getLink(), e.toString());
            LOG.debug("Publishing failure details:", e);
        }
    }

    /**
     * Publishes a single entry under the given collection: the entry is inserted if absent, otherwise
     * an update is attempted through {@link EntryRepository#updateIfNewer}.
     * The entry gets a surrogate key and an updated date assigned first, if it lacks them.
     */
    private void publishEntry(String collectionKey, Element entry) throws Exception {
        String entryKey = getOrGenerateEntryKey(entry);
        setUpdatedDateTimeIfNecessary(entry);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Publishing entry:\n{}.", DomHelper.getStringFromDomElement(entry));
        }
        String newEntryKey = entryRepository.putIfAbsent(collectionKey, entry);
        if (newEntryKey != null) {
            LOG.info("Entry with key {} inserted in collection {}.", entryKey, collectionKey);
            return;
        }
        LOG.info("Entry with key {} already existent in collection {}.", entryKey, collectionKey);
        String updatedEntryKey = entryRepository.updateIfNewer(collectionKey, entry);
        if (updatedEntryKey != null) {
            LOG.info("Entry with key {} in collection {} was updated.", entryKey, collectionKey);
        } else {
            LOG.info("Entry with key {} wasn't updated because is older.", entryKey);
        }
    }

    /**
     * Builds a new collection element, in its own namespace-aware document, carrying the given key and href
     * as attributes and containing an empty accept element plus a title element with the given title.
     *
     * @return the collection element, already appended as root of the newly created document.
     */
    private Element createCollectionElement(String key, String href, String title) throws Exception {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        DocumentBuilder builder = factory.newDocumentBuilder();
        Document collectionDocument = builder.newDocument();
        Element collectionElement = collectionDocument.createElementNS(AtomConstants.APP_NS,
                AtomConstants.ATOM_COLLECTION);
        collectionElement.setAttribute(AtomConstants.ATOM_KEY, key);
        collectionElement.setAttribute(AtomConstants.ATOM_COLLECTION_HREF, href);
        Element acceptElement = collectionDocument.createElementNS(AtomConstants.APP_NS,
                AtomConstants.ATOM_COLLECTION_ACCEPT);
        Element titleElement = collectionDocument.createElementNS(AtomConstants.ATOM_NS,
                AtomConstants.ATOM_COLLECTION_TITLE);
        titleElement.setTextContent(title);
        collectionElement.appendChild(acceptElement);
        collectionElement.appendChild(titleElement);
        collectionDocument.appendChild(collectionElement);
        return collectionElement;
    }

    /**
     * Returns the key attribute of the feed or, if missing or empty, a surrogate key derived from the
     * host name of the page URL, where dots are replaced by underscores and slashes, colons and commas
     * by dashes.
     */
    private String getOrGenerateCollectionKey(Page page, Element feed) throws Exception {
        String key = feed.getAttribute(AtomConstants.ATOM_KEY);
        if (key == null || key.equals("")) {
            LOG.warn("No feed key found, generating surrogate key ...");
            URL url = new URL(page.getLink().getLink());
            String host = url.getHost();
            // Defensive trimming of leading/trailing slashes, kept from the original implementation.
            if (host.startsWith("/")) {
                host = host.substring(1);
            }
            if (host.endsWith("/")) {
                host = host.substring(0, host.length() - 1);
            }
            key = host.replace('.', '_').replace('/', '-').replace(':', '-').replace(',', '-');
            LOG.warn("Surrogated feed key: {}", key);
        }
        return key;
    }

    /**
     * Returns the title to use for the collection: the text of the first title element found in the feed
     * when the feed has a key, otherwise the host name of the page URL. The host name is also used when
     * the feed has a key but no title element, rather than failing with a NullPointerException.
     */
    private String getOrGenerateCollectionTitle(Page page, Element feed) throws Exception {
        String key = feed.getAttribute(AtomConstants.ATOM_KEY);
        if (key != null && !key.equals("")) {
            Node titleNode = feed.getElementsByTagName(AtomConstants.ATOM_COLLECTION_TITLE).item(0);
            if (titleNode != null) {
                return titleNode.getTextContent();
            }
            LOG.warn("No feed title found, using host name as title ...");
        }
        return new URL(page.getLink().getLink()).getHost();
    }

    /**
     * Returns the key attribute of the entry or, if missing or empty, generates a surrogate key, stores it
     * as the entry key attribute and returns it. The surrogate key is built from the hash code of the
     * entry title: "e" + absolute value + "n" for negative hashes, "e" + value + "p" otherwise. If the
     * entry has no title element, the current time in milliseconds is used.
     */
    private String getOrGenerateEntryKey(Element entry) throws Exception {
        String key = entry.getAttribute(AtomConstants.ATOM_KEY);
        if (key == null || key.equals("")) {
            LOG.warn("No entry key found, generating surrogate key ...");
            NodeList titleNodes = entry.getElementsByTagName(AtomConstants.ATOM_ENTRY_TITLE);
            Node titleNode = titleNodes.item(0);
            if (titleNode != null) {
                int keyCode = titleNode.getTextContent().hashCode();
                if (keyCode < 0) {
                    // Widen to long before negating: negating Integer.MIN_VALUE as an int overflows and
                    // stays negative, which would leak a minus sign into the key.
                    key = "e" + Long.toString(-(long) keyCode) + "n";
                } else {
                    key = "e" + Integer.toString(keyCode) + "p";
                }
            } else {
                // NOTE(review): title-less entries processed within the same millisecond get the same key.
                key = Long.toString(System.currentTimeMillis());
            }
            entry.setAttribute(AtomConstants.ATOM_KEY, key);
            LOG.warn("Surrogated entry key: {}", key);
        }
        return key;
    }

    /**
     * Makes sure the entry carries an updated date: if the first updated element found in the entry is
     * empty it is filled with the current date and time, and if there is no such element a new one is
     * appended to the entry.
     */
    private void setUpdatedDateTimeIfNecessary(Element entry) {
        NodeList updatedNodes = entry.getElementsByTagName(AtomConstants.ATOM_ENTRY_UPDATED);
        Node updatedNode = updatedNodes.item(0);
        if (updatedNode != null) {
            String entryUpdatedDateTime = updatedNode.getTextContent();
            if (entryUpdatedDateTime == null || entryUpdatedDateTime.equals("")) {
                LOG.warn("The entry has no updated date, using current time ...");
                updatedNode.setTextContent(currentDateTime());
            }
        } else {
            LOG.warn("The entry has no updated date, using current time ...");
            updatedNode = entry.getOwnerDocument().createElementNS(AtomConstants.ATOM_NS,
                    AtomConstants.ATOM_ENTRY_UPDATED);
            updatedNode.setTextContent(currentDateTime());
            entry.appendChild(updatedNode);
        }
    }

    /**
     * Returns the current date and time printed with the ISO date-time formatter.
     */
    private String currentDateTime() {
        return ISODateTimeFormat.dateTime().print(System.currentTimeMillis());
    }
}