de.nava.informa.utils.FormatDetector.java Source code

Java tutorial

Introduction

Here is the source code for de.nava.informa.utils.FormatDetector.java

Source

//
// Informa -- RSS Library for Java
// Copyright (c) 2002 by Niko Schmuck
//
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//

package de.nava.informa.utils;

import de.nava.informa.core.ChannelFormat;
import de.nava.informa.core.UnsupportedFormatException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

/**
 * Utility class for analysing the news channel syntax and mapping it to
 * a known {@link ChannelFormat} to ease further processing.
 *
 * <p>Detection is a heuristic: only the first {@code NR_FIRST_BYTES} bytes
 * of the document are inspected, the first element start tag found
 * outside of an XML comment is taken as the root element, and the format
 * is derived from the text of that tag (name plus attributes).
 *
 * @author Niko Schmuck
 */
public final class FormatDetector {

    private static final Log logger = LogFactory.getLog(FormatDetector.class);

    /** Number of leading bytes of the document that are inspected. */
    private static final int NR_FIRST_BYTES = 2048;

    /**
     * Guess the format of the specified news channel. For performance
     * reason it is wise to minimize the number of format guesses.
     *
     * <p>The stream opened from the URL is closed before this method
     * returns, whether detection succeeds or fails.
     *
     * @param url a url to the news channel.
     * @return The news channel syntax format. RSS 0.91, 0.92 and 2.0,
     * RSS/RDF 1.0 and Atom 0.1, 0.2 and 0.3 are recognized; RSS 0.93 and
     * 0.94 are reported as {@link de.nava.informa.core.ChannelFormat#RSS_0_92}.
     * @throws UnsupportedFormatException in case a news channel format
     *                                    could not be guessed.
     * @throws IOException                if the given url cannot be read in.
     * @throws IllegalArgumentException   if no root element could be found
     *                                    in the inspected bytes.
     */
    public static ChannelFormat getFormat(URL url) throws IOException, UnsupportedFormatException {

        logger.info("Trying to retrieve stream from " + url);
        InputStream in = new BufferedInputStream(url.openStream(), NR_FIRST_BYTES);
        try {
            return getFormat(in);
        } finally {
            // The stream is opened here and never handed to the caller,
            // so it has to be released here as well.
            in.close();
        }
    }

    /**
     * Guess the format of the specified news channel. For performance
     * reason it is wise to minimize the number of format guesses.
     *
     * <p>Up to {@code NR_FIRST_BYTES} bytes are consumed from the stream.
     * The stream is neither reset nor closed by this method.
     *
     * @param in an InputStream to the news channel.
     * @return The news channel syntax format. RSS 0.91, 0.92 and 2.0,
     * RSS/RDF 1.0 and Atom 0.1, 0.2 and 0.3 are recognized; RSS 0.93 and
     * 0.94 are reported as {@link de.nava.informa.core.ChannelFormat#RSS_0_92}.
     * @throws UnsupportedFormatException in case a news channel format
     *                                    could not be guessed.
     * @throws IOException                if the given stream cannot be read.
     * @throws IllegalArgumentException   if no root element could be found
     *                                    in the inspected bytes.
     */
    public static ChannelFormat getFormat(InputStream in) throws IOException, UnsupportedFormatException {

        byte[] b = new byte[NR_FIRST_BYTES];

        // A single read() may return fewer bytes than requested, so keep
        // reading until the buffer is full or the stream is exhausted.
        int bytesRead = 0;
        while (bytesRead < NR_FIRST_BYTES) {
            int bytes = in.read(b, bytesRead, NR_FIRST_BYTES - bytesRead);
            if (bytes == -1) {
                break;
            }
            bytesRead += bytes;
        }

        String rootElement = getRootElement(b, bytesRead);
        logger.debug("Detected [" + rootElement + "].");
        if (rootElement.startsWith("rss")) {
            // The version is looked up anywhere in the start tag text,
            // i.e. in any attribute value, not only in "version".
            if (rootElement.contains("0.91")) {
                logger.info("Channel uses RSS root element (Version 0.91).");
                return ChannelFormat.RSS_0_91;
            } else if (rootElement.contains("0.92")) {
                logger.info("Channel uses RSS root element (Version 0.92).");
                return ChannelFormat.RSS_0_92;
            } else if (rootElement.contains("0.93")) {
                logger.info("Channel uses RSS root element (Version 0.93).");
                logger.warn("RSS 0.93 not fully supported yet, fall back to 0.92.");
                // FIXME: should really return ChannelFormat.RSS_0_93
                // when aware of all subtle differences.
                return ChannelFormat.RSS_0_92;
            } else if (rootElement.contains("0.94")) {
                logger.info("Channel uses RSS root element (Version 0.94).");
                logger.warn("RSS 0.94 not fully supported yet, fall back to 0.92.");
                // FIXME: should really return ChannelFormat.RSS_0_94
                // when aware of all subtle differences.
                return ChannelFormat.RSS_0_92;
            } else if (rootElement.contains("2.0")) {
                logger.info("Channel uses RSS root element (Version 2.0).");
                return ChannelFormat.RSS_2_0;
            } else {
                throw new UnsupportedFormatException("Unsupported RSS version [" + rootElement + "].");
            }
        } else if (rootElement.contains("rdf")) {
            logger.info("Channel uses RDF root element.");
            return ChannelFormat.RSS_1_0;
        } else if (rootElement.contains("feed")) {
            if (rootElement.contains("0.1")) {
                return ChannelFormat.ATOM_0_1;
            } else if (rootElement.contains("0.2")) {
                return ChannelFormat.ATOM_0_2;
            } else if (rootElement.contains("0.3")) {
                return ChannelFormat.ATOM_0_3;
            } else {
                throw new UnsupportedFormatException("Unsupported ATOM version [" + rootElement + "].");
            }
        } else {
            throw new UnsupportedFormatException(
                    "Not able to parse document " + "with root element [" + rootElement + "].");
        }
    }

    /**
     * Gets the name of the root element and the attributes (inclusive
     * namespace declarations), i.e. the text between the {@code <} and
     * the {@code >} of the first start tag found outside of a comment.
     *
     * @param b      buffer holding the beginning of the document.
     * @param length number of valid bytes at the start of {@code b}.
     * @return the content of the root element start tag without the
     * enclosing angle brackets.
     * @throws IOException              declared only because decoding by
     *                                  charset name is a checked operation.
     * @throws IllegalArgumentException if no complete start tag is
     *                                  contained in the given bytes.
     */
    private static String getRootElement(byte[] b, int length) throws IOException {
        // Decode only what was actually read, with a fixed charset so the
        // result does not depend on the platform default. Only the ASCII
        // markup matters for detection; undecodable bytes are replaced.
        String s = new String(b, 0, length, "UTF-8");
        int startPos = -1;
        int endPos = -1;
        boolean inComment = false;
        for (int i = 0; i < s.length(); i++) {
            // startsWith(prefix, offset) never reads past the end of the
            // text, unlike indexing i + 1 .. i + 3 directly.
            if (inComment) {
                if (s.startsWith("-->", i)) {
                    inComment = false;
                }
            } else if (s.startsWith("<!--", i)) {
                inComment = true;
            } else if (s.charAt(i) == '<' && i + 1 < s.length() && Character.isLetter(s.charAt(i + 1))) {
                // '<' followed by a letter: processing instructions ("<?")
                // and declarations ("<!") are skipped by this test.
                startPos = i + 1;
                endPos = s.indexOf('>', startPos);
                break;
            }
        } // for i
        if (startPos >= 0 && endPos > startPos) {
            return s.substring(startPos, endPos);
        } else {
            throw new IllegalArgumentException("Unable to retrieve root " + "element from " + s);
        }
    }

}