at.newsagg.parser.Atom_0_3_Parser.java Source code

Java tutorial

Introduction

Here is the source code for at.newsagg.parser.Atom_0_3_Parser.java

Source

//
// Informa -- RSS Library for Java
// Copyright (c) 2002 by Niko Schmuck
//
// Niko Schmuck
// http://sourceforge.net/projects/informa
// mailto:niko_schmuck@users.sourceforge.net
//
// This library is free software.
//
// You may redistribute it and/or modify it under the terms of the GNU
// Lesser General Public License as published by the Free Software Foundation.
//
// Version 2.1 of the license should be included with this distribution in
// the file LICENSE. If the license is not included with this distribution,
// you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
// or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
// MA 02139 USA.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied waranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//

// $Id: Atom_0_3_Parser.java,v 1.2 2005/09/16 13:05:51 vecego Exp $

//Roland Vecera
//11.02.2005
//Just added some comments

package at.newsagg.parser;

import java.net.URL;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom.Element;
import org.jdom.Namespace;

import at.newsagg.model.parser.*;
import at.newsagg.utils.ParserUtils;

/**
 * Parser which reads in document instances according to the Atom 0.3
 * specification and generates a news channel object. Currently the
 * support for the atom syntax is not complete.
 *
 * @author Niko Schmuck
 */

class Atom_0_3_Parser implements Atom_0_3_ParserIF {

    private static Log logger = LogFactory.getLog(Atom_0_3_Parser.class);

    public ChannelIF parse(ChannelIF cBuilder, Element channel) throws ParseException {
        if (cBuilder == null) {
            throw new RuntimeException("Without builder no  channel can " + "be created.");
        }
        Date dateParsed = new Date();
        Namespace defNS = ParserUtils.getDefaultNS(channel);
        if (defNS == null) {
            defNS = Namespace.NO_NAMESPACE;
            logger.info("No default namespace found.");
        }
        // RSS 1.0 Dublin Core Module namespace
        Namespace dcNS = ParserUtils.getNamespace(channel, "dc");
        if (dcNS == null) {
            logger.debug("No namespace for dublin core found");
            dcNS = defNS;
        }

        logger.debug("start parsing.");

        // get version attribute

        //TODO: formatVersion is never set somewhere, chnl.setFormat always to ChannelFormat.ATOM_0_3
        // see below!

        String formatVersion = "0.3";
        if (channel.getAttribute("version") != null) {
            formatVersion = channel.getAttribute("version").getValue().trim();
            logger.debug("Atom version " + formatVersion + " specified in document.");
        } else {
            logger.info("No format version specified, using default");
        }

        // --- read in channel information

        // title element

        cBuilder.setTitle(channel.getChildTextTrim("title", defNS));
        ChannelIF chnl = cBuilder;
        // TODO: support attributes: type, mode

        chnl.setFormat(ChannelFormat.ATOM_0_3);

        // language
        String language = channel.getAttributeValue("lang", Namespace.XML_NAMESPACE);
        if (language != null) {
            chnl.setLanguage(language);
        }

        // description element
        if (channel.getChild("description") != null) {
            chnl.setDescription(channel.getChildTextTrim("description", defNS));
        } else {
            // fallback
            chnl.setDescription(channel.getChildTextTrim("tagline", defNS));
        }

        // ttl in dc namespace
        Element ttl = channel.getChild("ttl", dcNS);
        if (ttl != null) {
            String ttlString = ttl.getTextTrim();
            if (ttlString != null) {
                chnl.setTtl(Integer.parseInt(ttlString));
            }
        }

        //  lastbuild element : modified ?
        Element modified = channel.getChild("modified", defNS);
        if (modified != null) {
            chnl.setPubDate(ParserUtils.getDate(modified.getTextTrim()));
        }

        // TODO : issued value
        /*
        if (modified != null) {
          modified = channel.getChild("issued", defNS);
          chnl.setLastBuildDate (ParserUtils.getDate(modified.getTextTrim()));
        }
        */

        // author element
        Element author = channel.getChild("author", defNS);
        if (author != null) {
            chnl.setCreator(author.getChildTextTrim("name", defNS));
        }

        // generator element
        Element generator = channel.getChild("generator", defNS);
        if (generator != null) {
            chnl.setGenerator(generator.getTextTrim());
        }

        // copyright element
        Element copyright = channel.getChild("copyright", defNS);
        if (copyright != null) {
            chnl.setCopyright(getValue(copyright));
        }

        // n link elements
        // TODO : type attribut of link (text, application...)
        List links = channel.getChildren("link", defNS);
        Iterator i = links.iterator();
        while (i.hasNext()) {
            Element linkElement = (Element) i.next();
            // use first 'alternate' link
            String rel = linkElement.getAttributeValue("rel");
            String href = linkElement.getAttributeValue("href");
            if (rel != null && href != null && rel.equals("alternate")) {
                URL linkURL = ParserUtils.getURL(href);
                chnl.setSite(linkURL);
                break;
            }

            // TODO: further extraction of link information
        }

        //***********************************
        //start <entry>, equals <item> in RSS!
        //***********************************
        // 1..n entry elements
        List items = channel.getChildren("entry", defNS);
        i = items.iterator();
        while (i.hasNext()) {
            Element item = (Element) i.next();

            // get title element
            // TODO : deal with type attribut
            Element elTitle = item.getChild("title", defNS);
            String strTitle = "<No Title>";
            if (elTitle != null) {
                strTitle = getValue(elTitle);
                logger.debug("Parsing title " + elTitle.getTextTrim() + "->" + strTitle);
            }
            if (logger.isDebugEnabled()) {
                logger.debug("Entry element found (" + strTitle + ").");
            }
            // TODO : deal with multiple links
            // get link element
            Element elLink = item.getChild("link", defNS);
            String strLink = "";
            if (elLink != null) {
                strLink = elLink.getAttributeValue("href").trim();
            }
            // TODO dealing with elLink.getAttributeValue("rel")
            logger.info("url read : " + strLink + " ," + elLink.getAttributeValue("rel"));

            // get description element
            Element elDesc = item.getChild("summary", defNS);
            String strDesc = "";
            if (elDesc != null) {
                strDesc = getValue(elDesc); //.getTextTrim();
            }
            // TODO: where to store the content?

            // generate new news item (link to article)
            ItemIF curItem = at.newsagg.model.parser.hibernate.ChannelBuilder.createItem(item, chnl, strTitle,
                    strDesc, ParserUtils.getURL(strLink));
            curItem.setFound(dateParsed);

            // get issued element (required)
            Element elIssued = item.getChild("issued", defNS);
            if (elIssued != null)
                curItem.setDate(ParserUtils.getDate(elIssued.getTextTrim()));
            else
                //    set to found-date
                //weil sonst, scheint ja das Item bei DATE sortierung gar nicht auf!
                curItem.setDate(dateParsed);

            // get subject element
            Element elSubject = item.getChild("subject", dcNS);
            if (elSubject != null) {
                // TODO: Mulitple subject elements not handled currently
                curItem.setSubject(elSubject.getTextTrim());
            }
            chnl.addItem(curItem);
        } //END While, end <entry>-tags

        // set to current date
        chnl.setLastUpdated(dateParsed);

        return chnl;
    }

    private static String getValue(Element elt) {
        if (elt == null)
            return "";
        String typeElt = elt.getAttributeValue("type");
        String modeElt = elt.getAttributeValue("mode");
        String value;
        String unescapedValue = elt.getTextTrim();

        if (unescapedValue == null || unescapedValue.length() == 0) {
            // maybe value in a div ?
            java.util.List listContent = elt.getContent();
            Iterator iter = listContent.iterator();
            while (iter.hasNext()) {
                org.jdom.Content content = (org.jdom.Content) iter.next();
                logger.debug("getValue : element read = " + content);
                if (content instanceof Element) {
                    unescapedValue = ((Element) content).getValue();
                    logger.debug("elt found in type :" + unescapedValue);
                }
            }
        }
        value = unescapedValue;
        if ("base64".equals(modeElt)) {
            value = ParserUtils.decodeBase64(value);
        }
        if ("text/html".equals(typeElt) || "application/xhtml+xml".equals(typeElt)) {
            value = ParserUtils.unEscape(value);
        }

        return value;
    }

}