org.roosster.input.UrlFetcher.java Source code

Java tutorial

Introduction

Here is the source code for org.roosster.input.UrlFetcher.java

Source

/*
 * This file is part of ROOSSTER.
 * Copyright 2004, Benjamin Reitzammer <benjamin@roosster.org>
 * All rights reserved.
 *
 * ROOSSTER is free software; you can redistribute it and/or modify
 * it under the terms of the Artistic License.
 *
 * You should have received a copy of the Artistic License
 * along with ROOSSTER; if not, go to
 * http://www.opensource.org/licenses/artistic-license.php for details
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 * EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.roosster.input;

import java.util.*;
import org.apache.log4j.Logger;
import java.util.logging.Level;
import java.net.URL;
import java.net.URLConnection;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;

import org.roosster.OperationException;
import org.roosster.InitializeException;
import org.roosster.Registry;
import org.roosster.Plugin;
import org.roosster.Output;
import org.roosster.Configuration;
import org.roosster.Constants;
import org.roosster.store.Entry;

/**
 *
 * @author <a href="mailto:benjamin@roosster.org">Benjamin Reitzammer</a>
 */
public class UrlFetcher implements Plugin, Constants {
    private static Logger LOG = Logger.getLogger(UrlFetcher.class.getName());

    public static final String PROP_DEF_ENC = "default.input.encoding";
    public static final String PROP_PROCESSORS = "fetcher.processors";

    private Registry registry = null;
    private String defaultEncoding = null;
    private Map processors = new Hashtable();
    private ContentTypeProcessor defaultProc = null;

    private boolean initialized = false;

    /**
     *
     */
    public void init(Registry registry) throws InitializeException {
        this.registry = registry;
        initProcessors(registry);

        defaultEncoding = registry.getConfiguration().getProperty(PROP_DEF_ENC);
        if (defaultEncoding == null)
            throw new InitializeException("Must provide default encoding via " + PROP_DEF_ENC);

        LOG.info("Initialized UrlFetcher: \ndefaultEncoding: " + defaultEncoding + "\ndefaultProcessor: "
                + defaultProc + "\nContentTypeProcessors: " + processors);

        initialized = true;
    }

    /**
     *
     */
    public boolean isInitialized() {
        return initialized;
    }

    /**
     *
     */
    public void shutdown(Registry registry) throws Exception {
        Iterator procIter = processors.values().iterator();

        while (procIter.hasNext()) {

            ContentTypeProcessor proc = null;
            try {
                proc = (ContentTypeProcessor) procIter.next();
                proc.shutdown(registry);
            } catch (Exception ex) {
                LOG.warn("Error while shutting down " + proc, ex);
            }

        }
    }

    /**
     *
     */
    public Entry[] fetch(URL[] urls) throws OperationException {
        if (urls == null)
            throw new IllegalArgumentException("No Parameter is allowed to be null!");

        List entries = new ArrayList();
        for (int i = 0; i < urls.length; i++) {

            try {
                entries.addAll(Arrays.asList(fetch(urls[i])));
            } catch (IOException ex) {
                LOG.warn("I/O Error while fetching URL " + urls[i] + ": " + ex.getMessage(), ex);
            } catch (Exception ex) {
                LOG.warn("Error while processing URL " + urls[i] + ": " + ex.getMessage(), ex);
            }

        }

        LOG.debug("Returning entries " + entries);

        return (Entry[]) entries.toArray(new Entry[0]);
    }

    // ============ private Helper methods ============

    /**
     * URLs will be fetched a second time, if the entry's lastFetched
     * object is <code>null</code>, when processed the first time.
     */
    private Entry[] fetch(URL url) throws IOException, Exception {
        LOG.debug("Opening connection to URL " + url);

        URLConnection con = url.openConnection();

        long modified = con.getLastModified();

        String embeddedContentEnc = null;

        String contentType = con.getContentType();
        if (contentType != null && contentType.indexOf(";") > -1) {
            LOG.debug("Content-type string (" + contentType + ") contains charset; strip it!");
            contentType = contentType.substring(0, contentType.indexOf(";")).trim();

            String cType = con.getContentType();
            if (cType.indexOf("=") > -1) {
                embeddedContentEnc = cType.substring(cType.indexOf("=") + 1).trim();
            }
        }

        String contentEnc = con.getContentEncoding();
        if (contentEnc == null) {
            if (embeddedContentEnc != null)
                contentEnc = embeddedContentEnc;
            else
                contentEnc = defaultEncoding;
        }

        ContentTypeProcessor proc = getProcessor(contentType);
        LOG.debug("ContentType: '" + contentType + "' - ContentEncoding: '" + contentEnc + "'");
        LOG.debug("Using Processor " + proc);

        Entry[] entries = proc.process(url, con.getInputStream(), contentEnc);

        Date modDate = new Date(modified);
        Date now = new Date();

        List returnArr = new ArrayList();
        for (int i = 0; i < entries.length; i++) {
            if (entries[i] == null)
                continue;

            URL entryUrl = entries[i].getUrl();

            String title = entries[i].getTitle();
            if (title == null || "".equals(title))
                entries[i].setTitle(entryUrl.toString());

            if (entries[i].getModified() == null)
                entries[i].setModified(modDate);

            if (entries[i].getIssued() == null)
                entries[i].setIssued(modDate);

            if (entries[i].getAdded() == null)
                entries[i].setAdded(now);

            String fileType = entries[i].getFileType();
            if (fileType == null || "".equals(fileType)) {

                int dotIndex = entryUrl.getPath().lastIndexOf(".");
                if (dotIndex != -1) {
                    String type = entryUrl.getPath().substring(dotIndex + 1);
                    entries[i].setFileType(type.toLowerCase());
                    LOG.debug("Filetype is subsequently set to '" + type + "'");
                }

            }

            returnArr.add(entries[i]);
            entries[i] = null;
        }

        return (Entry[]) returnArr.toArray(new Entry[0]);
    }

    /**
     *
     */
    private void initProcessors(Registry registry) throws InitializeException {
        Configuration conf = registry.getConfiguration();
        String procNames = conf.getProperty(PROP_PROCESSORS);

        if (procNames == null)
            throw new InitializeException("UrlFetcher needs ContentTypeProcessors");

        String defProcName = conf.getProperty(PROP_PROCESSORS + ".default");
        if (defProcName == null || "".equals(defProcName))
            throw new InitializeException("No default processor defined");

        StringTokenizer tok = new StringTokenizer(procNames.trim(), " ");
        while (tok.hasMoreTokens()) {
            String name = tok.nextToken();
            String clazz = conf.getProperty(PROP_PROCESSORS + "." + name + ".class");
            String typeStr = conf.getProperty(PROP_PROCESSORS + "." + name + ".type");

            if (clazz == null || typeStr == null) {
                LOG.warn("No Class or Type property defined for processor '" + name + "'");
                continue;
            }

            // split types by spaces, to allow single proc to
            // process multiple types
            List types = new ArrayList();
            StringTokenizer typeTok = new StringTokenizer(typeStr);
            while (typeTok.hasMoreTokens()) {
                types.add(typeTok.nextToken());
            }

            try {
                LOG.debug("Trying to load ContentTypeProcessor " + clazz);

                ContentTypeProcessor proc = (ContentTypeProcessor) Class.forName(clazz).newInstance();
                proc.init(registry);

                for (int i = 0; i < types.size(); i++) {
                    processors.put(types.get(i), proc);
                }

                if (defProcName.equals(name))
                    defaultProc = proc;

            } catch (ClassCastException ex) {
                LOG.warn("Processor " + name + " does not implement the " + ContentTypeProcessor.class
                        + " interface", ex);
                throw new InitializeException(ex);

            } catch (Exception ex) {
                LOG.warn("Error while loading processor " + name + " ; Message: " + ex.getMessage(), ex);

                throw new InitializeException(ex);
            }
        }

        if (defaultProc == null)
            throw new InitializeException("Invalid default processor defined (misspelled class?)");

    }

    /**
     *
     */
    private ContentTypeProcessor getProcessor(String contentType) {
        ContentTypeProcessor proc = (ContentTypeProcessor) processors.get(contentType);
        if (proc == null)
            return defaultProc;
        else
            return proc;
    }
}