de.nava.informa.parsers.Atom_1_0_Parser.java Source code

Introduction

Here is the source code for de.nava.informa.parsers.Atom_1_0_Parser.java
Source

//
// Informa -- RSS Library for Java
// Copyright (c) 2002 by Niko Schmuck
//
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//

package de.nava.informa.parsers;

import de.nava.informa.core.*;
import de.nava.informa.impl.basic.Category;
import de.nava.informa.utils.AtomParserUtils;
import de.nava.informa.utils.ParserUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom2.Element;
import org.jdom2.Namespace;

import java.net.URL;
import java.util.*;

/**
 * Parser which reads in document instances according to the Atom 1.0
 * specification and generates a news channel object. Currently the support for
 * the atom syntax is not complete, but work is in progress.
 * <p>
 * This parser is based on code of Atom 0.3 parser. Some good sources of
 * information regarding Atom 1.0 are:
 * </p>
 * <ul>
 *   <li>http://www.atomenabled.org/developers/syndication/atom-format-spec.php</li>
 *   <li>http://rakaz.nl/item/moving_from_atom_03_to_10</li>
 *   <li>http://www.atomenabled.org/developers/syndication/</li>
 *   <li>http://www-128.ibm.com/developerworks/xml/library/x-atom10.html</li>
 * </ul>
 *
 * @author Nilesh Bansal
 * @author Benjamin Wiedmann
 */
public class Atom_1_0_Parser implements ChannelParserIF {
    private static final Log LOGGER = LogFactory.getLog(Atom_1_0_Parser.class);

    /**
     * Private constructor suppresses generation of a (public) default constructor.
     */
    private Atom_1_0_Parser() {
    }

    /**
     * Holder of the Atom_1_0_Parser instance.
     */
    private static class Atom_1_0_ParserHolder {
        private static Atom_1_0_Parser instance = new Atom_1_0_Parser();
    }

    /**
     * Get the Atom_1_0_Parser instance.
     */
    public static Atom_1_0_Parser getInstance() {
        return Atom_1_0_ParserHolder.instance;
    }

    /**
     * a semicolon separated list of authors
     */
    static String getAuthorString(List authors, Namespace defNS) {
        String authorName = "";

        for (Object author : authors) {
            Element authorElt = (Element) author;

            if (authorElt != null) {
                //TODO author may have more information like uri and email
                ParserUtils.matchCaseOfChildren(authorElt, "name");

                if (!"".equals(authorName)) {
                    // if more than one author, a ; separated list
                    authorName += "; ";
                }

                authorName += authorElt.getChildTextTrim("name", defNS);
            }
        }

        return authorName;
    }

    /**
     * Returns the content type of element. Default is 'text'.
     */
    private static String getContentType(Element elt) {
        String type = elt.getAttributeValue("type");

        return (type == null) ? "text" : type;
    }

    /**
     * Looks for "content" elements and takes first from them or looks for
     * "summary" element if "content" not found.
     *
     * @param item      item element.
     * @param namespace namespace.
     * @return description for item.
     */
    public static String getDescription(Element item, Namespace namespace) {
        String strDesc = "";
        Element elDesc;

        // TODO there should be some way of knowing if we are returning summary or
        // content
        List contents = item.getChildren("content", namespace);

        if (contents.size() > 0) {
            elDesc = (Element) contents.get(0);
        } else {
            elDesc = item.getChild("summary", namespace);
        }

        if (elDesc != null) {
            strDesc = AtomParserUtils.getValue(elDesc, getMode(elDesc));
        }

        return strDesc;
    }

    /**
     * returns mode of a element based on its mime type
     */
    static String getMode(Element elt) {
        return getMode(getContentType(elt));
    }

    static String getMode(String type) {
        if (type == null) {
            // if type is not specified, text is to be assumed
            return "escaped";
        }

        if ("text".equals(type) || "html".equals(type)) {
            return "escaped";
        } else if ("xhtml".equals(type)) {
            return "xml";
        } else if (type.substring(type.length() - "xml".length()).equals("xml")) {
            return "xml";
        } else {
            return "base64";
        }
    }

    /**
     * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom2.Element)
     */
    public ChannelIF parse(ChannelBuilderIF cBuilder, Element channel) throws ParseException {
        if (cBuilder == null) {
            throw new RuntimeException("Without builder no channel can " + "be created.");
        }

        Date dateParsed = new Date();
        Namespace defNS = ParserUtils.getDefaultNS(channel);

        if (defNS == null) {
            defNS = Namespace.NO_NAMESPACE;
            LOGGER.info("No default namespace found.");
        } else if ((defNS.getURI() == null) || !defNS.getURI().equals("http://www.w3.org/2005/Atom")) {
            LOGGER.warn("Namespace is not really supported, still trying assuming Atom 1.0 format");
        }

        LOGGER.debug("start parsing.");

        // --- read in channel information

        // Lower the case of these tags to simulate case-insensitive parsing
        ParserUtils.matchCaseOfChildren(channel, new String[] { "title", "subtitle", "updated", "published",
                "author", "generator", "rights", "link", "entry" });

        // TODO icon and logo: Feed element can have upto 1 logo and icon.
        // TODO id: Feed and all entries have a unique id string. This can
        // be the URL of the website. Supporting this will require API change.
        // TODO: Feed can optionally have category information

        // title element
        ChannelIF chnl = cBuilder.createChannel(channel, channel.getChildTextTrim("title", defNS));

        chnl.setFormat(ChannelFormat.ATOM_1_0);

        // description element
        if (channel.getChild("subtitle") != null) {
            chnl.setDescription(channel.getChildTextTrim("subtitle", defNS));
        }

        // TODO: should we use summary element?

        // lastbuild element : updated ?
        Element updated = channel.getChild("updated", defNS);

        if (updated != null) {
            chnl.setPubDate(ParserUtils.getDate(updated.getTextTrim()));
        }

        // author element
        List authors = channel.getChildren("author", defNS);

        chnl.setCreator(getAuthorString(authors, defNS));

        // TODO we are ignoring contributors information

        // generator element
        Element generator = channel.getChild("generator", defNS);

        if (generator != null) {
            chnl.setGenerator(generator.getTextTrim());
        }

        // TODO generator can have URI and version information

        // copyright element
        Element rights = channel.getChild("rights", defNS);

        if (rights != null) {
            chnl.setCopyright(AtomParserUtils.getValue(rights, getMode(rights)));
        }

        List links = channel.getChildren("link", defNS);
        Iterator i = links.iterator();

        URL linkUrl = null;

        while (i.hasNext()) {
            Element linkElement = (Element) i.next();

            // use first 'alternate' link
            // if rel is not present, use first link without rel
            String rel = linkElement.getAttributeValue("rel");
            String href = linkElement.getAttributeValue("href");

            // TODO we need to handle relative links also
            if ((rel == null) && (href != null) && (linkUrl == null)) {
                linkUrl = ParserUtils.getURL(href);
            } else if ((rel != null) && (href != null) && rel.equals("alternate")) {
                linkUrl = ParserUtils.getURL(href);

                break;
            }
        }

        if (linkUrl != null) {
            chnl.setSite(linkUrl);
        }

        List items = channel.getChildren("entry", defNS);

        i = items.iterator();

        while (i.hasNext()) {
            Element item = (Element) i.next();

            // Lower the case of these tags to simulate case-insensitive parsing
            ParserUtils.matchCaseOfChildren(item,
                    new String[] { "title", "link", "content", "summary", "published", "author" });

            // TODO entry, if copied from some other feed, may have source element
            // TODO each entry can have its own rights declaration

            // get title element
            Element elTitle = item.getChild("title", defNS);
            String strTitle = "<No Title>";

            if (elTitle != null) {
                strTitle = AtomParserUtils.getValue(elTitle, getMode(elTitle));
                LOGGER.debug("Parsing title " + elTitle.getTextTrim() + "->" + strTitle);
            }

            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("Entry element found (" + strTitle + ").");
            }

            // get link element
            String strLink = AtomParserUtils.getItemLink(item, defNS);

            // get description element
            String strDesc = getDescription(item, defNS);

            // generate new news item (link to article)
            ItemIF curItem = cBuilder.createItem(item, chnl, strTitle, strDesc, ParserUtils.getURL(strLink));

            //TODO enclosure data
            curItem.setFound(dateParsed);

            List itemAuthors = item.getChildren("author", defNS);

            curItem.setCreator(getAuthorString(itemAuthors, defNS));

            // get published element
            Element elIssued = item.getChild("published", defNS);

            if (elIssued == null) {
                // published element may not be present (but updated should be)
                Element elUpdated = item.getChild("updated", defNS);

                // TODO there should be some way to determining which one are we
                // returning
                if (elUpdated != null) {
                    curItem.setDate(ParserUtils.getDate(elUpdated.getTextTrim()));
                }
            } else {
                curItem.setDate(ParserUtils.getDate(elIssued.getTextTrim()));
            }

            // get list of category elements
            List elCategoryList = item.getChildren("category", defNS);

            // categories present will be stored here
            Collection<CategoryIF> categories = new ArrayList<>();

            // multiple category elements may be present
            for (Object elCategoryItem : elCategoryList) {

                Element elCategory = (Element) elCategoryItem;

                // notice: atom spec. forbids to have category "term" (="subject")
                // set as inner text of category tags, so we have to read it from
                // the "term" attribute

                if (elCategory != null) {
                    // TODO: what if we have more than one category element present?
                    // subject would be overwritten each loop and therefore represent only
                    // the last category read, so does this make any sense?

                    // TODO: what about adding functionality for accessing "label" or "scheme" attributes?
                    // if set, a label should be displayed instead of the value set in term

                    // we keep this line not to break up things which
                    // use getSubject() to read an item category
                    curItem.setSubject(elCategory.getAttributeValue("term"));

                    CategoryIF c = new Category(elCategory.getAttributeValue("term"));

                    // add current category to category list
                    categories.add(c);
                }
            }

            // assign categories
            curItem.setCategories(categories);
        }

        // set to current date
        chnl.setLastUpdated(dateParsed);

        return chnl;
    }
}