de.nava.informa.parsers.FeedParser.java Source code

Java tutorial

Introduction

Here is the source code for de.nava.informa.parsers.FeedParser.java

Source

//
// Informa -- RSS Library for Java
// Copyright (c) 2002 by Niko Schmuck
//
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//

package de.nava.informa.parsers;

import de.nava.informa.core.ChannelBuilderIF;
import de.nava.informa.core.ChannelIF;
import de.nava.informa.core.ParseException;
import de.nava.informa.core.UnsupportedFormatException;
import de.nava.informa.utils.NoOpEntityResolver;
import de.nava.informa.utils.ParserUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom2.*;
import org.jdom2.input.SAXBuilder;
import org.xml.sax.InputSource;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;

/**
 * Parser class which allows reading in of RSS news channels.
 * The concrete rules how the XML elements map to our channel object model
 * are delegated to version specific private classes.
 *
 * <p>Currently the FeedParser support RSS formats 0.9x, 1.0 (RDF), 2.0 and Atom 0.3.</p>
 *
 * <p>It's possible to specify class of custom XML parser to use instead of
 * standard Crimson parser (JDK). Use <code>setSaxDriverClassName()</code> method
 * for that.</p>
 *
 * @author Niko Schmuck
 */
public class FeedParser {

    private static final Log LOGGER = LogFactory.getLog(FeedParser.class);

    /**
     * Parser feed behind the given URL and build channel.
     *
     * @param cBuilder specific channel builder to use.
     * @param url      URL to use as data source.
     * @return parsed channel.
     * @throws IOException    if IO errors occur.
     * @throws ParseException if parsing is not possible.
     */
    public static ChannelIF parse(ChannelBuilderIF cBuilder, String url) throws IOException, ParseException {
        URL aURL = null;
        try {
            aURL = new URL(url);
        } catch (java.net.MalformedURLException e) {
            LOGGER.warn("Could not create URL for " + url);
        }
        return parse(cBuilder, new InputSource(url), aURL);
    }

    /**
     * Parser feed behind the given URL and build channel.
     *
     * @param cBuilder specific channel builder to use.
     * @param aURL     URL to use as data source.
     * @return parsed channel.
     * @throws IOException    if IO errors occur.
     * @throws ParseException if parsing is not possible.
     */
    public static ChannelIF parse(ChannelBuilderIF cBuilder, URL aURL) throws IOException, ParseException {
        return parse(cBuilder, new InputSource(aURL.toExternalForm()), aURL);
    }

    /**
     * Parse feed presented by Reader and build channel.
     *
     * @param cBuilder specific channel builder to use.
     * @param reader   reader setup to read feed data.
     * @return parsed channel.
     * @throws IOException    if IO errors occur.
     * @throws ParseException if parsing is not possible.
     */
    public static ChannelIF parse(ChannelBuilderIF cBuilder, Reader reader) throws IOException, ParseException {
        return parse(cBuilder, new InputSource(reader), null);
    }

    /**
     * Parse feed presented by InputStream and build channel.
     *
     * @param cBuilder specific channel builder to use.
     * @param stream   stream setup to read feed data.
     * @return parsed channel.
     * @throws IOException    if IO errors occur.
     * @throws ParseException if parsing is not possible.
     */
    public static ChannelIF parse(ChannelBuilderIF cBuilder, InputStream stream)
            throws IOException, ParseException {
        return parse(cBuilder, new InputSource(stream), null);
    }

    /**
     * Parse feed presented by file and build channel.
     *
     * @param cBuilder specific channel builder to use.
     * @param aFile    file to read data from.
     * @return parsed channel.
     * @throws IOException    if IO errors occur.
     * @throws ParseException if parsing is not possible.
     */
    public static ChannelIF parse(ChannelBuilderIF cBuilder, File aFile) throws IOException, ParseException {
        URL aURL;
        try {
            aURL = aFile.toURI().toURL();
        } catch (java.net.MalformedURLException e) {
            throw new IOException("File " + aFile + " had invalid URL representation.");
        }
        return parse(cBuilder, new InputSource(aURL.toExternalForm()), aURL);
    }

    /**
     * Parse feed from input source with base location set and create channel.
     *
     * @param cBuilder     specific channel builder to use.
     * @param inpSource    input source of data.
     * @param baseLocation base location of feed.
     * @return parsed channel.
     * @throws IOException    if IO errors occur.
     * @throws ParseException if parsing is not possible.
     */
    public static ChannelIF parse(ChannelBuilderIF cBuilder, InputSource inpSource, URL baseLocation)
            throws IOException, ParseException {
        // document reading without validation
        SAXBuilder saxBuilder = new SAXBuilder();

        // turn off DTD loading
        saxBuilder.setEntityResolver(new NoOpEntityResolver());

        try {
            Document doc = saxBuilder.build(inpSource);
            ChannelIF channel = parse(cBuilder, doc);
            channel.setLocation(baseLocation);
            return channel;
        } catch (JDOMException e) {
            throw new ParseException("Problem parsing " + inpSource + ": " + e);
        }
    }

    // ------------------------------------------------------------
    // internal helper methods
    // ------------------------------------------------------------

    private static synchronized ChannelIF parse(ChannelBuilderIF cBuilder, Document doc) throws ParseException {

        if (cBuilder == null) {
            throw new RuntimeException("Without builder no channel can " + "be created.");
        }
        LOGGER.debug("start parsing.");
        // Get the root element (must be rss)
        Element root = doc.getRootElement();
        String rootElement = root.getName().toLowerCase();
        // Decide which parser to use
        if (rootElement.startsWith("rss")) {
            String rssVersion = root.getAttribute("version").getValue();
            if (rssVersion.contains("0.91")) {
                LOGGER.info("Channel uses RSS root element (Version 0.91).");
                return RSS_0_91_Parser.getInstance().parse(cBuilder, root);
            } else if (rssVersion.contains("0.92")) {
                LOGGER.info("Channel uses RSS root element (Version 0.92).");
                // logger.warn("RSS 0.92 not fully supported yet, fall back to 0.91.");
                // TODO: support RSS 0.92 when aware of all subtle differences.
                return RSS_0_91_Parser.getInstance().parse(cBuilder, root);
            } else if (rootElement.contains("0.93")) {
                LOGGER.info("Channel uses RSS root element (Version 0.93).");
                LOGGER.warn("RSS 0.93 not fully supported yet, fall back to 0.91.");
                // TODO: support RSS 0.93 when aware of all subtle differences.
            } else if (rootElement.contains("0.94")) {
                LOGGER.info("Channel uses RSS root element (Version 0.94).");
                LOGGER.warn("RSS 0.94 not fully supported yet, will use RSS 2.0");
                // TODO: support RSS 0.94 when aware of all subtle differences.
                return RSS_2_0_Parser.getInstance().parse(cBuilder, root);
            } else if (rssVersion.contains("2.0") || rssVersion.equals("2")) {
                LOGGER.info("Channel uses RSS root element (Version 2.0).");
                return RSS_2_0_Parser.getInstance().parse(cBuilder, root);
            } else {
                throw new UnsupportedFormatException("Unsupported RSS version [" + rssVersion + "].");
            }
        } else if (rootElement.contains("rdf")) {
            return RSS_1_0_Parser.getInstance().parse(cBuilder, root);
        } else if (rootElement.contains("feed")) {
            Attribute versionAttr = root.getAttribute("version");
            Namespace namespace = ParserUtils.getDefaultNS(root);
            if (versionAttr != null) {
                String feedVersion = versionAttr.getValue();
                if (feedVersion.contains("0.1") || feedVersion.contains("0.2")) {
                    LOGGER.info("Channel uses feed root element (Version " + feedVersion + ").");
                    LOGGER.warn("This atom version is not really supported yet, assume Atom 0.3 format");
                    return Atom_0_3_Parser.getInstance().parse(cBuilder, root);
                } else if (feedVersion.contains("0.3")) {
                    LOGGER.info("Channel uses feed root element (Version 0.3).");
                    return Atom_0_3_Parser.getInstance().parse(cBuilder, root);
                }
            } else if (namespace != null && namespace.getURI() != null) {
                if (!namespace.getURI().equals("http://www.w3.org/2005/Atom")) {
                    LOGGER.warn("Channel uses unknown namespace in feed root element, assume Atom 1.0 format.");
                } else {
                    LOGGER.info("Channel uses feed root element (Atom 1.0 format).");
                }
                return Atom_1_0_Parser.getInstance().parse(cBuilder, root);
            }
        }

        // did not match anything
        throw new UnsupportedFormatException("Unsupported root element [" + rootElement + "].");
    }

    // ==========================================================

    public static void main(String args[]) throws IOException, ParseException {

        if (args.length < 2) {
            System.err.println("Usage: java " + FeedParser.class.getName() + " [-f <filename> | -u <url>]");
            System.exit(1);
        }

        String option = args[0];
        String data = args[1];

        ChannelBuilderIF builder = new de.nava.informa.impl.basic.ChannelBuilder();
        ChannelIF channel;
        if (option.trim().startsWith("-f")) {
            channel = FeedParser.parse(builder, new File(data));
        } else {
            channel = FeedParser.parse(builder, new URL(data));
        }

        System.out.println("Channel format: " + channel.getFormat().toString());
        System.out.println(channel);
        System.out.println("containing " + channel.getItems().size() + " items");
        for (Object item : channel.getItems()) {
            System.out.println("  - " + item);
        }
    }
}