org.paxle.crawler.ftp.impl.FtpCrawler.java Source code

Introduction

Here is the source code for org.paxle.crawler.ftp.impl.FtpCrawler.java.
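
FtpCrawler is an OSGi Declarative Services component (declared via the Apache Felix SCR annotations) that registers itself both under its own IFtpCrawler interface and as a generic Paxle ISubCrawler for the "ftp" protocol. Given an ftp:// URI, it downloads either a single file or a generated directory listing into an ICrawlerDocument, honoring a configurable connection timeout, socket timeout, and maximum download size.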

Source

/**
 * This file is part of the Paxle project.
 * Visit http://www.paxle.net for more information.
 * Copyright 2007-2010 the original author or authors.
 *
 * Licensed under the terms of the Common Public License 1.0 ("CPL 1.0").
 * Any use, reproduction or distribution of this program constitutes the recipient's acceptance of this agreement.
 * The full license text is available under http://www.opensource.org/licenses/cpl1.0.txt
 * or in the file LICENSE.txt in the root directory of the Paxle distribution.
 *
 * Unless required by applicable law or agreed to in writing, this software is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

package org.paxle.crawler.ftp.impl;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.net.ftp.FTPFile;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.felix.scr.annotations.Services;
import org.paxle.core.doc.ICrawlerDocument;
import org.paxle.crawler.ICrawlerContext;
import org.paxle.crawler.ICrawlerContextLocal;
import org.paxle.crawler.ICrawlerTools;
import org.paxle.crawler.ISubCrawler;
import org.paxle.crawler.ICrawlerTools.DirlistEntry;
import org.paxle.crawler.ftp.IFtpCrawler;

@Component(metatype = false, immediate = true, name = FtpCrawler.PID)
@Services({ @Service(IFtpCrawler.class), @Service(ISubCrawler.class) })
@Property(name = ISubCrawler.PROP_PROTOCOL, value = { "ftp" })
public class FtpCrawler implements IFtpCrawler {
    /* =========================================================
     * Config Properties
     * ========================================================= */
    static final String PID = "org.paxle.crawler.ftp.IFtpCrawler";

    @Property(intValue = 15000)
    static final String PROP_CONNECTION_TIMEOUT = PID + '.' + "connectionTimeout";

    @Property(intValue = 15000)
    static final String PROP_SOCKET_TIMEOUT = PID + '.' + "socketTimeout";

    @Property(intValue = 10485760)
    static final String PROP_MAXDOWNLOAD_SIZE = PID + '.' + "maxDownloadSize";

    private int connectionTimeout = 15000;
    private int socketTimeout = 15000;
    private int maxDownloadSize = 10485760;

    /**
     * For logging
     */
    private Log logger = LogFactory.getLog(this.getClass());

    @Reference
    protected ICrawlerContextLocal contextLocal;

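    /**
     * Called by the SCR runtime on component activation. The given map contains
     * the component properties, including any values supplied via the OSGi
     * Configuration Admin service, which override the annotated defaults above.
     */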
    protected void activate(Map<String, Object> configuration) {
        // configuring timeouts
        final Integer connectionTimeout = (Integer) configuration.get(PROP_CONNECTION_TIMEOUT);
        if (connectionTimeout != null)
            this.connectionTimeout = connectionTimeout.intValue();

        final Integer socketTimeout = (Integer) configuration.get(PROP_SOCKET_TIMEOUT);
        if (socketTimeout != null)
            this.socketTimeout = socketTimeout.intValue();

        // download limit in bytes
        final Integer maxDownloadSize = (Integer) configuration.get(PROP_MAXDOWNLOAD_SIZE);
        if (maxDownloadSize != null)
            this.maxDownloadSize = maxDownloadSize.intValue();
    }

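    /**
     * Fetches the resource referenced by the given ftp-{@link URI} and stores it
     * into a newly created {@link ICrawlerDocument}. Directories are written out
     * as a generated directory listing; regular files are downloaded as-is.
     *
     * @param requestUri the ftp://-URI to crawl; must not be <code>null</code>
     * @return the crawler-document with its status set accordingly
     */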
    public ICrawlerDocument request(URI requestUri) {
        if (requestUri == null)
            throw new NullPointerException("The request URI must not be null.");
        this.logger.info(String.format("Crawling URL '%s' ...", requestUri));

        ICrawlerDocument crawlerDoc = null;
        try {
            final ICrawlerContext ctx = this.contextLocal.getCurrentContext();

            // creating a crawler-doc and set some basic properties
            crawlerDoc = ctx.createDocument();
            crawlerDoc.setCrawlerDate(new Date());
            crawlerDoc.setLocation(requestUri);

            FtpUrlConnection ftpConnection = new FtpUrlConnection(requestUri.toURL());
            if (this.connectionTimeout >= 0)
                ftpConnection.setConnectTimeout(this.connectionTimeout);
            if (this.socketTimeout >= 0)
                ftpConnection.setReadTimeout(this.socketTimeout);

            // connect to host
            ftpConnection.connect();

            // get the modification date of the file
            long modTimeStamp = ftpConnection.getLastModified();
            if (modTimeStamp != 0) {
                crawlerDoc.setLastModDate(new Date(modTimeStamp));
            }

            // getting content-type if available
            String contentType = ftpConnection.getContentType();
            if (contentType != null) {
                crawlerDoc.setMimeType(contentType);
            }

            // checking download size limit
            if (this.maxDownloadSize > 0) {
                int contentLength = ftpConnection.getContentLength();
                if (contentLength > this.maxDownloadSize) {
                    // reject the document
                    final String msg = String.format(
                            "Content-length '%d' of resource '%s' is larger than the max. allowed size of '%d' bytes.",
                            Integer.valueOf(contentLength), requestUri, Integer.valueOf(this.maxDownloadSize));

                    this.logger.warn(msg);
                    crawlerDoc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE, msg);
                    return crawlerDoc;
                }
            }

            final ICrawlerTools crawlerTools = ctx.getCrawlerTools();
            if (ftpConnection.isDirectory()) {
                final FTPFile[] list = ftpConnection.listFiles();
                final Iterator<DirlistEntry> dirlistIt = new DirlistIterator(list);

                // generate & save dir-listing into file
                crawlerTools.saveListing(crawlerDoc, dirlistIt, true, list.length > 50);
            } else {
                // get input stream
                InputStream input = ftpConnection.getInputStream();
                try {
                    // copy data into file
                    crawlerTools.saveInto(crawlerDoc, input);
                } finally {
                    // close the connection even if saving fails
                    input.close();
                }
            }

            // finished
            crawlerDoc.setStatus(ICrawlerDocument.Status.OK);
        } catch (IOException e) {
            if (e instanceof FtpConnectionException) {
                crawlerDoc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE, e.getMessage());
            } else {
                crawlerDoc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE,
                        "Unexpected Exception: " + e.getMessage());
            }

            this.logger.warn(String.format("Unexpected '%s' while trying to crawl resource '%s'.",
                    e.getClass().getName(), requestUri), e);
        } catch (URISyntaxException e) {
            this.logger.warn(
                    String.format("Unexpected URI syntax exception while converting URL->URI: %s", e.getMessage()));
        }

        return crawlerDoc;
    }

    /**
     * A wrapper class around a {@link FTPFile} which implements the methods necessary for
     * the dirlist-generation.
     */
    private static class DirlistEntryImpl implements DirlistEntry {
        FTPFile file;

        public URI getFileURI() {
            // no absolute URI is available for FTP listing entries; the
            // dirlist-generation identifies the entry by its file name instead
            return null;
        }

        public String getFileName() {
            return file.getName();
        }

        public long getLastModified() {
            return file.getTimestamp().getTimeInMillis();
        }

        public long getSize() {
            return file.getSize();
        }
    }

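    /**
     * Iterates over an array of {@link FTPFile}s, reusing a single mutable
     * {@link DirlistEntryImpl} as a cursor to avoid allocating one wrapper
     * object per directory entry. Callers must therefore fully process each
     * entry before advancing to the next one.
     */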
    private static class DirlistIterator implements Iterator<DirlistEntry> {

        private final DirlistEntryImpl entry = new DirlistEntryImpl();
        private final FTPFile[] list;
        private int idx = 0;

        public DirlistIterator(final FTPFile[] list) {
            this.list = list;
        }

        public boolean hasNext() {
            return idx < list.length;
        }

        public DirlistEntry next() {
            entry.file = list[idx++];
            return entry;
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }
    }
}
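
Usage

In a running OSGi framework this component is instantiated and wired by the SCR runtime, so the snippet below is only a minimal sketch of how a consumer might drive it. The manual instantiation, the example ftp:// address, and the ICrawlerDocument read accessors getStatus() and getLocation() are assumptions for illustration; in Paxle the instance would instead be obtained from the service registry via the ISubCrawler interface, with the ICrawlerContextLocal reference injected by SCR.

import java.net.URI;

import org.paxle.core.doc.ICrawlerDocument;
import org.paxle.crawler.ftp.impl.FtpCrawler;

public class FtpCrawlerDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical standalone use: in production SCR creates the component
        // and injects the ICrawlerContextLocal reference; without that binding
        // request(...) would fail with a NullPointerException.
        FtpCrawler crawler = new FtpCrawler();

        // ftp.example.org is a placeholder host for illustration only
        ICrawlerDocument doc = crawler.request(new URI("ftp://ftp.example.org/pub/README"));

        // assumes ICrawlerDocument exposes read accessors matching the
        // setters used by the crawler above
        if (doc != null && doc.getStatus() == ICrawlerDocument.Status.OK) {
            System.out.println("Crawled successfully: " + doc.getLocation());
        }
    }
}

The three annotated @Property defaults can be overridden at runtime through the OSGi Configuration Admin service under the component PID org.paxle.crawler.ftp.IFtpCrawler; the merged properties then reach the component through the activate(Map) callback shown above. A minimal sketch, assuming a ConfigurationAdmin instance has already been obtained from the service registry:

import java.util.Hashtable;

import org.osgi.service.cm.Configuration;
import org.osgi.service.cm.ConfigurationAdmin;

public class FtpCrawlerConfigDemo {
    /** Doubles both timeouts and raises the download limit to 20 MB. */
    public void reconfigure(ConfigurationAdmin configAdmin) throws Exception {
        Configuration config = configAdmin.getConfiguration("org.paxle.crawler.ftp.IFtpCrawler", null);

        Hashtable<String, Object> props = new Hashtable<String, Object>();
        props.put("org.paxle.crawler.ftp.IFtpCrawler.connectionTimeout", Integer.valueOf(30000));
        props.put("org.paxle.crawler.ftp.IFtpCrawler.socketTimeout", Integer.valueOf(30000));
        props.put("org.paxle.crawler.ftp.IFtpCrawler.maxDownloadSize", Integer.valueOf(20971520));

        // without a @Modified method, SCR re-activates the component
        // with the merged properties
        config.update(props);
    }
}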