de.nava.informa.parsers.RSS_2_0_Parser.java Source code

Java tutorial

Introduction

Here is the source code for de.nava.informa.parsers.RSS_2_0_Parser.java

Source

//
// Informa -- RSS Library for Java
// Copyright (c) 2002 by Niko Schmuck
//
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//

package de.nava.informa.parsers;

import de.nava.informa.core.*;
import de.nava.informa.impl.basic.ChannelBuilder;
import de.nava.informa.utils.ParserUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom2.Attribute;
import org.jdom2.Element;
import org.jdom2.Namespace;

import java.net.URL;
import java.util.*;

/**
 * Parser which reads in document instances according to the RSS 2.0
 * specification and generates a news channel object.
 *
 * @author Anthony Eden
 * @author Niko Schmuck
 * @author Benjamin Wiedmann
 */
class RSS_2_0_Parser implements ChannelParserIF {

    private static final Log logger = LogFactory.getLog(RSS_2_0_Parser.class);

    /**
     * Private constructor suppresses generation of a (public) default constructor.
     */
    private RSS_2_0_Parser() {
    }

    /**
     * Holder of the RSS_2_0_Parser instance. The instance is created when
     * this nested class is initialised, i.e. on the first call of
     * {@link RSS_2_0_Parser#getInstance()}.
     */
    private static class RSS_2_0_ParserHolder {
        private static final RSS_2_0_Parser instance = new RSS_2_0_Parser();
    }

    /**
     * Get the RSS_2_0_Parser instance.
     *
     * @return the single shared parser instance
     */
    public static RSS_2_0_Parser getInstance() {
        return RSS_2_0_ParserHolder.instance;
    }

    /**
     * Recursively turns one node of an already built category tree into
     * {@link CategoryIF} objects.
     *
     * @param parent   category the new one is created under; <code>null</code> at top level
     * @param title    title of the category to create
     * @param children sub-tree holding the child titles of this category
     * @return the category created for <code>title</code>
     */
    private static CategoryIF getCategoryList(CategoryIF parent, String title,
                                              RecursiveHashtable<String> children) {
        // Assuming category hierarchy for each category element
        // is already mapped out into a Hashtable tree; hence the children Hashtable.

        // create channel builder to help create CategoryIF objects
        ChannelBuilder builder = new ChannelBuilder();

        // create current CategoryIF object; parent may be null if at top level
        CategoryIF cat = builder.createCategory(parent, title);

        Enumeration<String> childKeys = children.keys();
        while (childKeys.hasMoreElements()) {
            String childKey = childKeys.nextElement();
            // The returned child is not kept here: it was created with 'cat' as its parent
            // (presumably the builder links it to the parent -- verify against ChannelBuilder).
            getCategoryList(cat, childKey, children.get(childKey));
        }
        return cat;
    }

    /**
     * Looks up an attribute, first in the given namespace and then without
     * any namespace.
     * <p>
     * Unprefixed XML attributes do not inherit the default namespace of their
     * element, so a lookup restricted to a non-empty default namespace never
     * matches them. The fallback makes attributes readable in feeds that
     * declare a default namespace.
     *
     * @param element element owning the attribute
     * @param name    local attribute name
     * @param ns      namespace tried first
     * @return the attribute, or <code>null</code> if neither lookup matches
     */
    private static Attribute findAttribute(Element element, String name, Namespace ns) {
        Attribute attribute = element.getAttribute(name, ns);
        if (attribute == null) {
            attribute = element.getAttribute(name);
        }
        return attribute;
    }

    /**
     * Returns the value of an attribute found via
     * {@link #findAttribute(Element, String, Namespace)}.
     *
     * @param element element owning the attribute
     * @param name    local attribute name
     * @param ns      namespace tried first
     * @return the attribute value, or <code>null</code> if the attribute is absent
     */
    private static String findAttributeValue(Element element, String name, Namespace ns) {
        Attribute attribute = findAttribute(element, name, ns);
        return (attribute == null) ? null : attribute.getValue();
    }

    /**
     * Reads the <code>category</code> children of an element and converts
     * them into a list of top-level categories. The text of each category
     * element is split at <code>/</code> to obtain a hierarchy; equal path
     * prefixes of different category elements are merged.
     *
     * @param parent element (channel or item) whose categories are read
     * @param defNS  namespace tried first
     * @param dcNS   namespace used as fallback when no category is found in <code>defNS</code>
     * @return the top-level categories, possibly empty, never <code>null</code>
     */
    private static List<CategoryIF> parseCategories(Element parent, Namespace defNS, Namespace dcNS) {
        List<Element> categoryElements = parent.getChildren("category", defNS);
        if (categoryElements.isEmpty()) {
            // fallback mechanism: get dc:category element
            categoryElements = parent.getChildren("category", dcNS);
        }

        List<CategoryIF> categories = new ArrayList<CategoryIF>();
        if (categoryElements.isEmpty()) {
            return categories;
        }

        // for each category element, map its path into the tree
        RecursiveHashtable<String> tree = new RecursiveHashtable<String>();
        for (Element categoryElement : categoryElements) {
            RecursiveHashtable<String> level = tree;
            for (String title : categoryElement.getTextNormalize().split("/")) {
                RecursiveHashtable<String> next = level.get(title);
                if (next == null) {
                    // token not yet known at this level: add it with an empty sub-tree
                    next = new RecursiveHashtable<String>();
                    level.put(title, next);
                }
                // descend, so the next token becomes a child of this one
                level = next;
            }
        }

        // transform the tree into CategoryIF objects, one per top-level title
        Enumeration<String> rootTitles = tree.keys();
        while (rootTitles.hasMoreElements()) {
            String rootTitle = rootTitles.nextElement();
            categories.add(getCategoryList(null, rootTitle, tree.get(rootTitle)));
        }
        return categories;
    }

    /**
     * Creates one news item from an <code>item</code> element and fills in
     * its optional properties (subject, categories, date, author, comments,
     * guid, source and enclosure).
     *
     * @param cBuilder   builder used to create the item and its parts
     * @param chnl       channel the item is created for
     * @param item       the <code>item</code> element
     * @param defNS      default namespace of the feed
     * @param dcNS       namespace used for the dc:* fallbacks
     * @param contentNS  namespace used for <code>content:encoded</code>
     * @param dateParsed time stamp stored as the "found" date of the item
     */
    private static void parseItem(ChannelBuilderIF cBuilder, ChannelIF chnl, Element item,
                                  Namespace defNS, Namespace dcNS, Namespace contentNS,
                                  Date dateParsed) {
        ParserUtils.matchCaseOfChildren(item,
                new String[] { "title", "link", "encoded", "description", "subject", "category", "pubDate",
                        "date", "author", "creator", "comments", "guid", "source", "enclosure" });

        // get title element
        String strTitle = item.getChildTextTrim("title", defNS);
        if (strTitle == null) {
            strTitle = "<No Title>";
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Item element found (" + strTitle + ").");
        }

        // get link element
        String strLink = item.getChildTextTrim("link", defNS);
        if (strLink == null) {
            strLink = "";
        }

        // get description element; content:encoded wins over description
        Element elDesc = item.getChild("encoded", contentNS);
        if (elDesc == null) {
            elDesc = item.getChild("description", defNS);
        }
        String strDesc = (elDesc == null) ? "" : elDesc.getTextTrim();

        // generate new RSS item (link to article)
        ItemIF rssItem = cBuilder.createItem(item, chnl, strTitle, strDesc, ParserUtils.getURL(strLink));

        // get subject element
        Element elSubject = item.getChild("subject", defNS);
        if (elSubject == null) {
            // fallback mechanism: get dc:subject element
            elSubject = item.getChild("subject", dcNS);
        }
        if (elSubject != null) {
            rssItem.setSubject(elSubject.getTextTrim());
        }

        // get category list; only set it if categories were actually created
        List<CategoryIF> categories = parseCategories(item, defNS, dcNS);
        if (!categories.isEmpty()) {
            rssItem.setCategories(categories);
        }

        // get publication date
        Element elDate = item.getChild("pubDate", defNS);
        if (elDate == null) {
            // fallback mechanism: get dc:date element
            elDate = item.getChild("date", dcNS);
        }
        if (elDate != null) {
            rssItem.setDate(ParserUtils.getDate(elDate.getTextTrim()));
        }

        rssItem.setFound(dateParsed);

        // get author element
        Element elAuthor = item.getChild("author", defNS);
        if (elAuthor == null) {
            // fallback mechanism: get dc:creator element
            elAuthor = item.getChild("creator", dcNS);
        }
        if (elAuthor != null) {
            rssItem.setCreator(elAuthor.getTextTrim());
        }

        // get comments element; an absent element is passed on as empty string
        String strComments = item.getChildTextTrim("comments", defNS);
        rssItem.setComments(ParserUtils.getURL((strComments == null) ? "" : strComments));

        // get guid element
        Element elGuid = item.getChild("guid", defNS);
        if (elGuid != null) {
            // a guid is treated as permalink unless isPermaLink says otherwise
            boolean permaLink = true;
            String permaLinkStr = findAttributeValue(elGuid, "isPermaLink", defNS);
            if (permaLinkStr != null) {
                permaLink = Boolean.parseBoolean(permaLinkStr);
            }
            ItemGuidIF itemGuid = cBuilder.createItemGuid(rssItem, elGuid.getTextTrim(), permaLink);
            rssItem.setGuid(itemGuid);
        }

        // get source element; a source without url attribute is ignored
        Element elSource = item.getChild("source", defNS);
        if (elSource != null) {
            String sourceName = elSource.getTextTrim();
            Attribute sourceAttribute = findAttribute(elSource, "url", defNS);
            if (sourceAttribute != null) {
                String sourceLocation = sourceAttribute.getValue().trim();
                ItemSourceIF itemSource = cBuilder.createItemSource(rssItem, sourceName, sourceLocation, null);
                rssItem.setSource(itemSource);
            }
        }

        // get enclosure element (only the first one is read)
        Element elEnclosure = item.getChild("enclosure", defNS);
        if (elEnclosure != null) {
            URL location = null;
            String type = null;
            int length = -1;
            Attribute urlAttribute = findAttribute(elEnclosure, "url", defNS);
            if (urlAttribute != null) {
                location = ParserUtils.getURL(urlAttribute.getValue().trim());
            }
            Attribute typeAttribute = findAttribute(elEnclosure, "type", defNS);
            if (typeAttribute != null) {
                type = typeAttribute.getValue().trim();
            }
            Attribute lengthAttribute = findAttribute(elEnclosure, "length", defNS);
            if (lengthAttribute != null) {
                String rawLength = lengthAttribute.getValue().trim();
                try {
                    length = Integer.parseInt(rawLength);
                } catch (NumberFormatException e) {
                    // keep -1 as "unknown length" and carry on with the enclosure
                    logger.warn("Invalid enclosure length: '" + rawLength + "'", e);
                }
            }
            ItemEnclosureIF itemEnclosure = cBuilder.createItemEnclosure(rssItem, location, type, length);
            rssItem.setEnclosure(itemEnclosure);
        }
    }

    /**
     * Creates the channel image from an <code>image</code> element.
     *
     * @param cBuilder builder used to create the image
     * @param image    the <code>image</code> element
     * @param defNS    default namespace of the feed
     * @return the created image; width and height stay unset if they are not valid integers
     */
    private static ImageIF parseImage(ChannelBuilderIF cBuilder, Element image, Namespace defNS) {
        ParserUtils.matchCaseOfChildren(image,
                new String[] { "title", "url", "link", "width", "height", "description" });

        ImageIF rssImage = cBuilder.createImage(image.getChildTextTrim("title", defNS),
                ParserUtils.getURL(image.getChildTextTrim("url", defNS)),
                ParserUtils.getURL(image.getChildTextTrim("link", defNS)));

        String width = image.getChildTextTrim("width", defNS);
        if (width != null) {
            try {
                rssImage.setWidth(Integer.parseInt(width));
            } catch (NumberFormatException e) {
                logger.warn("Error parsing width: " + e.getMessage());
            }
        }
        String height = image.getChildTextTrim("height", defNS);
        if (height != null) {
            try {
                rssImage.setHeight(Integer.parseInt(height));
            } catch (NumberFormatException e) {
                logger.warn("Error parsing height: " + e.getMessage());
            }
        }
        String description = image.getChildTextTrim("description", defNS);
        if (description != null) {
            rssImage.setDescription(description);
        }
        return rssImage;
    }

    /**
     * Creates the channel text input from a <code>textinput</code> element.
     *
     * @param cBuilder builder used to create the text input
     * @param txtinp   the <code>textinput</code> element
     * @param defNS    default namespace of the feed
     * @return the created text input
     */
    private static TextInputIF parseTextInput(ChannelBuilderIF cBuilder, Element txtinp, Namespace defNS) {
        ParserUtils.matchCaseOfChildren(txtinp, new String[] { "title", "description", "name", "link" });

        return cBuilder.createTextInput(txtinp.getChildTextTrim("title", defNS),
                txtinp.getChildTextTrim("description", defNS), txtinp.getChildTextTrim("name", defNS),
                ParserUtils.getURL(txtinp.getChildTextTrim("link", defNS)));
    }

    /**
     * Creates the channel cloud from a <code>cloud</code> element.
     *
     * @param cBuilder builder used to create the cloud
     * @param cloud    the <code>cloud</code> element
     * @param defNS    default namespace of the feed
     * @return the created cloud; its port is -1 if the attribute is missing or not an integer
     */
    private static CloudIF parseCloud(ChannelBuilderIF cBuilder, Element cloud, Namespace defNS) {
        int port = -1;
        String rawPort = findAttributeValue(cloud, "port", defNS);
        if (rawPort != null) {
            try {
                port = Integer.parseInt(rawPort.trim());
            } catch (NumberFormatException e) {
                logger.warn("Invalid cloud port: '" + rawPort + "'", e);
            }
        }
        return cBuilder.createCloud(findAttributeValue(cloud, "domain", defNS), port,
                findAttributeValue(cloud, "path", defNS),
                findAttributeValue(cloud, "registerProcedure", defNS),
                findAttributeValue(cloud, "protocol", defNS));
    }

    /**
     * Builds a channel, including its items, from the root element of an
     * RSS 2.0 document.
     *
     * @param cBuilder builder used to create the channel and all of its parts
     * @param root     root element of the feed document
     * @return the populated channel
     * @throws ParseException           if the document contains no <code>channel</code> element
     * @throws IllegalArgumentException if <code>cBuilder</code> is <code>null</code>
     * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom2.Element)
     */
    public ChannelIF parse(ChannelBuilderIF cBuilder, Element root) throws ParseException {
        if (cBuilder == null) {
            throw new IllegalArgumentException("Without builder no channel can be created.");
        }
        // one time stamp shared by the channel and all of its items
        Date dateParsed = new Date();
        logger.debug("start parsing.");

        Namespace defNS = ParserUtils.getDefaultNS(root);
        if (defNS == null) {
            defNS = Namespace.NO_NAMESPACE;
            logger.info("No default namespace found.");
        }
        // Dublin Core namespace; fall back to default name space
        Namespace dcNS = ParserUtils.getNamespace(root, "dc");
        if (dcNS == null) {
            dcNS = defNS;
        }
        // Content namespace; fall back to default name space
        Namespace contentNS = ParserUtils.getNamespace(root, "content");
        if (contentNS == null) {
            contentNS = defNS;
        }

        ParserUtils.matchCaseOfChildren(root, "channel");

        // Get the channel element (only one occurs)
        Element channel = root.getChild("channel", defNS);
        if (channel == null) {
            logger.warn("Channel element could not be retrieved from feed.");
            throw new ParseException("No channel element found in feed.");
        }

        // --- read in channel information

        // Every channel child read below is listed here, including "ttl"
        // (presumably this aligns the case of the element names with the
        // spelling used for the lookups -- see ParserUtils).
        ParserUtils.matchCaseOfChildren(channel,
                new String[] { "title", "description", "link", "language", "item", "image", "textinput",
                        "copyright", "rating", "docs", "generator", "ttl", "pubDate", "lastBuildDate",
                        "category", "managingEditor", "webMaster", "cloud" });

        // 1 title element
        ChannelIF chnl = cBuilder.createChannel(channel, channel.getChildTextTrim("title", defNS));

        // set channel format
        chnl.setFormat(ChannelFormat.RSS_2_0);

        // 1 description element
        chnl.setDescription(channel.getChildTextTrim("description", defNS));

        // 1 link element
        chnl.setSite(ParserUtils.getURL(channel.getChildTextTrim("link", defNS)));

        // 1 language element
        chnl.setLanguage(channel.getChildTextTrim("language", defNS));

        // 1..n item elements
        for (Element item : channel.getChildren("item", defNS)) {
            parseItem(cBuilder, chnl, item, defNS, dcNS, contentNS, dateParsed);
        }

        // 0..1 image element
        Element image = channel.getChild("image", defNS);
        if (image != null) {
            chnl.setImage(parseImage(cBuilder, image, defNS));
        }

        // 0..1 textinput element
        Element txtinp = channel.getChild("textinput", defNS);
        if (txtinp != null) {
            chnl.setTextInput(parseTextInput(cBuilder, txtinp, defNS));
        }

        // 0..1 copyright element
        String copyright = channel.getChildTextTrim("copyright", defNS);
        if (copyright != null) {
            chnl.setCopyright(copyright);
        }

        // 0..1 rating element
        String rating = channel.getChildTextTrim("rating", defNS);
        if (rating != null) {
            chnl.setRating(rating);
        }

        // 0..1 docs element
        String docs = channel.getChildTextTrim("docs", defNS);
        if (docs != null) {
            chnl.setDocs(docs);
        }

        // 0..1 generator element
        String generator = channel.getChildTextTrim("generator", defNS);
        if (generator != null) {
            chnl.setGenerator(generator);
        }

        // 0..1 ttl element
        String ttlValue = channel.getChildTextTrim("ttl", defNS);
        if (ttlValue != null) {
            try {
                chnl.setTtl(Integer.parseInt(ttlValue));
            } catch (NumberFormatException e) {
                logger.warn("Invalid TTL format: '" + ttlValue + "'");
            }
        }

        // 0..1 pubDate element
        String pubDate = channel.getChildTextTrim("pubDate", defNS);
        if (pubDate != null) {
            chnl.setPubDate(ParserUtils.getDate(pubDate));
        }

        // 0..1 lastBuildDate element
        String lastBuildDate = channel.getChildTextTrim("lastBuildDate", defNS);
        if (lastBuildDate != null) {
            chnl.setLastBuildDate(ParserUtils.getDate(lastBuildDate));
        }

        // 0..n category elements; only set the list if categories were actually created
        List<CategoryIF> categories = parseCategories(channel, defNS, dcNS);
        if (!categories.isEmpty()) {
            chnl.setCategories(categories);
        }

        // 0..1 managingEditor element
        String managingEditor = channel.getChildTextTrim("managingEditor", defNS);
        if (managingEditor != null) {
            chnl.setCreator(managingEditor);
        }

        // 0..1 webMaster element
        String webMaster = channel.getChildTextTrim("webMaster", defNS);
        if (webMaster != null) {
            chnl.setPublisher(webMaster);
        }

        // 0..1 cloud element
        Element cloud = channel.getChild("cloud", defNS);
        if (cloud != null) {
            chnl.setCloud(parseCloud(cBuilder, cloud, defNS));
        }

        chnl.setLastUpdated(dateParsed);

        // 0..1 skipHours element (not read)
        // 0..1 skipDays element (not read)

        return chnl;
    }

    /**
     * Implement type safety in a hashtable of hashtables.
     * Each key maps to a table of the same kind, which makes the structure
     * usable as a tree of keys.
     *
     * @author Italo Borssatto
     */
    private static class RecursiveHashtable<T> extends Hashtable<T, RecursiveHashtable<T>> {
        /**
         * <code>serialVersionUID</code>
         */
        private static final long serialVersionUID = -3748524793347081535L;

        /**
         * @see java.util.Hashtable#put(java.lang.Object, java.lang.Object)
         */
        @Override
        public synchronized RecursiveHashtable<T> put(T key, RecursiveHashtable<T> value) {
            return super.put(key, value);
        }
    }

}