org.jab.docsearch.utils.NetUtils.java Source code

Introduction

Here is the source code for org.jab.docsearch.utils.NetUtils.java
Source

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package org.jab.docsearch.utils;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.jab.docsearch.spider.SpiderUrl;

/**
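 * This class contains useful methods for network access: querying the size and
 * modification date of a URL, downloading a URL to a file and detecting changes.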
 *
 * @version $Id: NetUtils.java 163 2011-01-06 21:37:42Z henschel $
 */
public final class NetUtils {
    /**
     * Log4J
     */
    private final Logger logger = Logger.getLogger(NetUtils.class.getName());
    /**
     * DocSearcher user agent
     */
    private final String USER_AGENT = "DocSearcher " + I18n.getString("ds.version");
    /**
     * Size in bytes of the buffer used when copying URL content to a file
     */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Gets URL size (content)
     *
     * The value is the one reported by {@link URLConnection#getContentLength()}, so it is
     * -1 when the server does not announce a length. No explicit connect or read timeout
     * is configured here.
     *
     * @param url  URL for connect
     * @return     size in bytes of a url, -1 if the size is not known or 0 if the
     *             connection fails with an I/O error
     */
    public long getURLSize(final String url) {
        URLConnection conn = null;
        try {
            conn = openConnection(url);

            long contentLength = conn.getContentLength();

            if (logger.isDebugEnabled()) {
                logger.debug("getURLSize() content lentgh=" + contentLength + " of URL='" + url + "'");
            }

            return contentLength;
        } catch (IOException ioe) {
            logger.error("getURLSize() failed for URL='" + url + "'", ioe);
            return 0;
        } finally {
            // only headers were read, so nothing else would release the connection
            release(conn);
        }
    }

    /**
     * Gets URL modified date as long
     *
     * The value is the one reported by {@link URLConnection#getLastModified()}.
     *
     * @param url  URL to connect
     * @return     date of URLs modification or 0 if it is not known or an error occurs
     */
    public long getURLModifiedDate(final String url) {
        URLConnection conn = null;
        try {
            conn = openConnection(url);

            long modifiedDate = conn.getLastModified();

            if (logger.isDebugEnabled()) {
                logger.debug("getURLModifiedDate() modified date=" + modifiedDate + " of URL='" + url + "'");
            }

            return modifiedDate;
        } catch (IOException ioe) {
            logger.error("getURLModifiedDate() failed for URL='" + url + "'", ioe);
            return 0;
        } finally {
            // only headers were read, so nothing else would release the connection
            release(conn);
        }
    }

    /**
     * Downloads URL to file
     *
     * Content type, size (number of bytes copied) and md5 are set in SpiderUrl. The content
     * type is set as soon as the connection is established, i.e. also when the download
     * fails afterwards; size and md5 are only set after a successful download. The last
     * modified value of the SpiderUrl is not touched by this method.
     *
     * @param spiderUrl  URL to download
     * @param file       URL content downloads to this file
     * @return           true if URL successfully downloads to a file
     */
    public boolean downloadURLToFile(final SpiderUrl spiderUrl, final String file) {

        boolean downloadOk = false;
        BufferedInputStream inputStream = null;
        BufferedOutputStream outputStream = null;
        try {
            URLConnection conn = openConnection(spiderUrl.getUrl());

            // content type
            spiderUrl.setContentType(getContentType(conn));

            // open streams
            inputStream = new BufferedInputStream(conn.getInputStream());
            outputStream = new BufferedOutputStream(new FileOutputStream(file));

            // copy data from URL to file (flushes, so write errors are reported here)
            spiderUrl.setSize(copy(inputStream, outputStream));

            downloadOk = true;
        } catch (IOException ioe) {
            logger.error("downloadURLToFile() failed for URL='" + spiderUrl.getUrl() + "'", ioe);
        } finally {
            IOUtils.closeQuietly(outputStream);
            IOUtils.closeQuietly(inputStream);
        }

        // the hash is computed from the file after the streams are closed
        if (downloadOk) {
            spiderUrl.setMd5(FileUtils.getMD5Sum(file));
        }

        return downloadOk;
    }

    /**
     * Downloads URL to file
     *
     * The connection is owned by the caller: its input stream is read to the end and
     * closed, but the connection itself is not disconnected here.
     *
     * @param conn  URL Connection
     * @param file  URL content downloads to this file
     * @return      true if URL successfully downloads to a file
     */
    public boolean downloadURLToFile(URLConnection conn, final String file) {

        BufferedInputStream inputStream = null;
        BufferedOutputStream outputStream = null;
        try {
            // open streams
            inputStream = new BufferedInputStream(conn.getInputStream());
            outputStream = new BufferedOutputStream(new FileOutputStream(file));

            // copy data from URL to file (flushes, so write errors are reported here)
            copy(inputStream, outputStream);

            return true;
        } catch (IOException ioe) {
            logger.error("downloadURLToFile() failed to download", ioe);
            return false;
        } finally {
            IOUtils.closeQuietly(outputStream);
            IOUtils.closeQuietly(inputStream);
        }
    }

    /**
     * Get SpiderUrl status
     *
     * The content type of the SpiderUrl is always updated once the connection is
     * established. If the last modified date of the URL is known and equal to the one
     * stored in the SpiderUrl the URL counts as unchanged without downloading it.
     * Otherwise the URL is downloaded to the file and the md5 of the file is compared
     * with the md5 stored in the SpiderUrl; size, last modified and md5 are set in
     * SpiderUrl only if the hashes differ.
     *
     * @param spiderUrl  URL to check
     * @param file       URL content downloads to this file
     * @return           -1 if link is broken, 0 if the file is unchanged or 1 if the file
     *                   is different... part of caching algorithm.
     */
    public int getURLStatus(final SpiderUrl spiderUrl, final String file) {
        // -1 means broken link
        //  0 means same file
        //  1 means changed

        URLConnection conn = null;
        try {
            // attempt to obtain status from date
            conn = openConnection(spiderUrl.getUrl());

            // content type
            spiderUrl.setContentType(getContentType(conn));

            // check date
            long spiDate = spiderUrl.getLastModified();
            long urlDate = conn.getLastModified();

            // same if date is equal and not zero (zero means the date is unknown)
            if (spiDate == urlDate && urlDate != 0) {
                return 0;
            }

            // date gives no answer: download the URL and compare hashes
            if (!downloadURLToFile(conn, file)) {
                // download failed, broken link
                return -1;
            }

            String fileHash = FileUtils.getMD5Sum(file);
            if (fileHash.equals(spiderUrl.getMd5())) {
                // same content
                return 0;
            }

            // changed, remember the new values
            spiderUrl.setSize(FileUtils.getFileSize(file));
            spiderUrl.setLastModified(urlDate);
            spiderUrl.setMd5(fileHash);
            return 1;
        } catch (IOException ioe) {
            logger.error("getURLStatus() failed for URL='" + spiderUrl.getUrl() + "'", ioe);
            return -1;
        } finally {
            // on the "unchanged by date" path the body is never read, so the
            // connection has to be released explicitly
            release(conn);
        }
    }

    /**
     * Gets content type from connection and set default if null
     *
     * @param con  URL connection
     * @return     Content type reported by the connection or "html" if it reports none
     */
    public String getContentType(final URLConnection con) {
        String contentType = con.getContentType();

        if (contentType != null) {
            logger.debug("getContentType() content type is '" + contentType + "'");
        } else {
            logger.debug("getContentType() Null content type - assuming html anyway.");
            // FIXME check this, if we need this default value!!!
            contentType = "html";
        }

        return contentType;
    }

    /**
     * Opens a read-only, non-caching connection to the URL that identifies itself with
     * the DocSearcher user agent, and connects it.
     *
     * @param url  URL to connect
     * @return     connected URL connection
     * @throws IOException  if the URL is malformed or the connection cannot be established
     */
    private URLConnection openConnection(final String url) throws IOException {
        URLConnection conn = new URL(url).openConnection();

        // set connection parameter
        conn.setDoInput(true);
        conn.setDoOutput(false);
        conn.setUseCaches(false);
        conn.setRequestProperty("User-Agent", USER_AGENT);

        // connect
        conn.connect();

        return conn;
    }

    /**
     * Releases an HTTP connection; connections of other types and null are ignored.
     *
     * @param conn  URL connection, may be null
     */
    private static void release(final URLConnection conn) {
        if (conn instanceof HttpURLConnection) {
            ((HttpURLConnection) conn).disconnect();
        }
    }

    /**
     * Copies all bytes from the input to the output and flushes the output, so that a
     * failing write surfaces as an exception instead of being swallowed by a quiet close.
     * The streams are not closed.
     *
     * @param in   stream to read from
     * @param out  stream to write to
     * @return     number of bytes copied
     * @throws IOException  if reading, writing or flushing fails
     */
    private static long copy(final InputStream in, final OutputStream out) throws IOException {
        final byte[] buffer = new byte[COPY_BUFFER_SIZE];
        long total = 0;
        int count;
        while ((count = in.read(buffer)) != -1) {
            out.write(buffer, 0, count);
            total += count;
        }
        out.flush();
        return total;
    }
}