org.bibsonomy.util.WebUtils.java Source code

Introduction

Here is the source code for org.bibsonomy.util.WebUtils.java
Source

/**
 *
 *  BibSonomy-Web-Common - A blue social bookmark and publication sharing system.
 *
 *  Copyright (C) 2006 - 2011 Knowledge & Data Engineering Group,
 *                            University of Kassel, Germany
 *                            http://www.kde.cs.uni-kassel.de/
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

package org.bibsonomy.util;

import static org.bibsonomy.util.ValidationUtils.present;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.StringReader;
import java.net.ConnectException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * @author rja
 * @version $Id: WebUtils.java,v 1.2 2011-01-29 07:34:45 rja Exp $
 */
public class WebUtils {
    /** Logger used by the static helpers of this class. */
    private static final Log log = LogFactory.getLog(WebUtils.class);

    /**
     * Maximum number of requests {@link #getRedirectUrl(URL)} issues while
     * following a chain of redirects.
     */
    private static final int MAX_REDIRECT_COUNT = 10;

    /**
     * The user agent sent with every request: it is set as request property
     * on the {@link HttpURLConnection}s opened by this class and as parameter
     * of the shared {@link HttpClient} (see the static initializer below).
     */
    private static final String USER_AGENT_PROPERTY_VALUE = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)";

    /* marker preceding the charset name in a Content-Type header value */
    private static final String CHARSET = "charset=";
    /* charset returned by extractCharset(String) when the header names none */
    private static final String DEFAULT_CHARSET = "UTF-8";
    /* response header read by getRedirectUrl(URL) to find the redirect target */
    private static final String LOCATION = "Location";
    /* separates key and value of one entry of the postData string */
    private static final String EQUAL_SIGN = "=";
    /* separates the entries of the postData string */
    private static final String AMP_SIGN = "&";
    /* appended after every line copied by inputStreamToStringBuilder */
    private static final String NEWLINE = "\n";
    /* terminates the charset parameter inside a Content-Type header value */
    private static final String SEMICOLON = ";";
    /* names of the HTTP headers set or read by this class */
    private static final String USER_AGENT_HEADER_NAME = "User-Agent";
    private static final String COOKIE_HEADER_NAME = "Cookie";
    private static final String CONTENT_TYPE_HEADER_NAME = "Content-Type";

    /*
     * The maximum number of characters (~bytes) to read from a HTTP connection.
     * We fixed this to 1 MB to avoid that radio streams or huge files
     * mess up our heap. If this is not enough, please increase the size
     * carefully. The limit is enforced in inputStreamToStringBuilder.
     */
    private static final int MAX_CONTENT_LENGTH = 1 * 1024 * 1024;

    /*
     * According to http://hc.apache.org/httpclient-3.x/threading.html
     * HttpClient is thread safe when it is backed by a
     * MultiThreadedHttpConnectionManager, so one instance is shared by all
     * requests. Every executed method must be released again, otherwise its
     * connection is never returned to the manager.
     */
    private static final MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
    private static final HttpClient client = new HttpClient(connectionManager);
    static {
        // let the shared client identify itself like the HttpURLConnection based requests
        client.getParams().setParameter(HttpMethodParams.USER_AGENT, USER_AGENT_PROPERTY_VALUE);
    }

    /**
     * Sends a POST request with the given content to the given URL and
     * returns the body of the response as string.
     * <p>
     * The request is sent with the content type
     * <code>application/x-www-form-urlencoded</code>, so
     * <code>postContent</code> is expected to be form-encoded already. The
     * connection is always disconnected before this method returns, also when
     * the request fails.
     * 
     * @param url - the URL the request is sent to.
     * @param postContent - the body of the POST request.
     * @param charset - the assumed charset of the result. If <code>null</code>, the charset from the response header is used.
     * @param cookie - the Cookie to be attached to the request. If <code>null</code>, the Cookie header is not set.
     * @return The content of the result page.
     * 
     * @throws IOException if the request could not be sent or the response could not be read.
     * 
     * @deprecated this method works on a plain {@link HttpURLConnection};
     *             {@link #getContentAsString(String, String, String, String)}
     *             sends POST requests with the shared {@link HttpClient}.
     */
    @Deprecated
    public static String getPostContentAsString(final URL url, final String postContent, final String charset,
            final String cookie) throws IOException {
        final HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
        try {
            urlConn.setAllowUserInteraction(false);
            urlConn.setDoInput(true);
            urlConn.setDoOutput(true);
            urlConn.setUseCaches(false);
            urlConn.setRequestMethod("POST");
            urlConn.setRequestProperty(CONTENT_TYPE_HEADER_NAME, "application/x-www-form-urlencoded");

            /*
             * set user agent (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) since some 
             * pages require it to download content.
             */
            urlConn.setRequestProperty(USER_AGENT_HEADER_NAME, USER_AGENT_PROPERTY_VALUE);

            if (cookie != null) {
                urlConn.setRequestProperty(COOKIE_HEADER_NAME, cookie);
            }

            /*
             * getOutputStream() opens the connection, hence no explicit
             * connect() is required. The stream is closed in any case such
             * that it does not leak when writing fails.
             */
            final OutputStream requestBody = urlConn.getOutputStream();
            try {
                writeStringToStream(postContent, requestBody);
            } finally {
                requestBody.close();
            }

            /*
             * extract character encoding from header
             */
            final String activeCharset;
            if (charset == null) {
                activeCharset = getCharset(urlConn);
            } else {
                activeCharset = charset;
            }

            /*
             * FIXME: check content type header to ensure that we only read textual 
             * content (and not a PDF, radio stream or DVD image ...)
             */
            return inputStreamToStringBuilder(urlConn.getInputStream(), activeCharset).toString();
        } finally {
            // release the connection also when sending or reading failed
            urlConn.disconnect();
        }
    }

    /**
     * Sends a POST request with the given content to the given URL without
     * attaching a cookie and returns the body of the response as string.
     * Delegates to {@link #getPostContentAsString(URL, String, String, String)}
     * with a <code>null</code> cookie.
     * 
     * @param url - the URL the request is sent to.
     * @param postContent - the body of the POST request.
     * @param charset - the assumed charset of the result. If <code>null</code>, the charset from the response header is used.
     * @return The content of the result page.
     * 
     * @throws IOException if the request could not be sent or the response could not be read.
     * 
     * @deprecated this method works on a plain {@link HttpURLConnection};
     *             {@link #getContentAsString(String, String, String, String)}
     *             sends POST requests with the shared {@link HttpClient}.
     */
    @Deprecated
    public static String getPostContentAsString(final URL url, final String postContent, final String charset)
            throws IOException {
        return getPostContentAsString(url, postContent, charset, null);
    }

    /**
     * Sends a POST request with the given content and cookie to the given URL
     * and returns the body of the response as string. The charset of the
     * result is taken from the response header. Delegates to
     * {@link #getPostContentAsString(URL, String, String, String)} with a
     * <code>null</code> charset.
     * <p>
     * Note that, unlike in the other overloads, the cookie is the
     * <em>first</em> parameter.
     * 
     * @param cookie - the Cookie to be attached to the request. If <code>null</code>, the Cookie header is not set.
     * @param url - the URL the request is sent to.
     * @param postContent - the body of the POST request.
     * @return The content of the result page.
     * 
     * @throws IOException if the request could not be sent or the response could not be read.
     * 
     * @deprecated this method works on a plain {@link HttpURLConnection};
     *             {@link #getContentAsString(String, String, String, String)}
     *             sends POST requests with the shared {@link HttpClient}.
     */
    @Deprecated
    public static String getPostContentAsString(final String cookie, final URL url, final String postContent)
            throws IOException {
        return getPostContentAsString(url, postContent, null, cookie);
    }

    /**
     * Sends a POST request with the given content to the given URL and
     * returns the body of the response as string. No cookie is attached and
     * the charset of the result is taken from the response header. Delegates
     * to {@link #getPostContentAsString(URL, String, String, String)}.
     * 
     * @param url - the URL the request is sent to.
     * @param postContent - the body of the POST request.
     * @return The content of the result page.
     * 
     * @throws IOException if the request could not be sent or the response could not be read.
     * 
     * @deprecated this method works on a plain {@link HttpURLConnection};
     *             {@link #getContentAsString(String, String, String, String)}
     *             sends POST requests with the shared {@link HttpClient}.
     */
    @Deprecated
    public static String getPostContentAsString(final URL url, final String postContent) throws IOException {
        return getPostContentAsString(url, postContent, null, null);
    }

    /**
     * Reads from a URL and writes the content into a string. The charset of
     * the content is taken from the response header. The connection is always
     * disconnected before this method returns, also when the request fails.
     * 
     * @param inputURL the URL of the content.
     * @param cookie a cookie which should be included in the header of the
     *        request send to the server. If <code>null</code>, the Cookie
     *        header is not set.
     * @return String which holds the page content.
     * @throws IOException if the content could not be read. A
     *         {@link ConnectException} is wrapped into a plain
     *         {@link IOException}.
     * 
     * @deprecated this method works on a plain {@link HttpURLConnection};
     *             {@link #getContentAsString(String, String)} sends the
     *             request with the shared {@link HttpClient}.
     */
    @Deprecated
    public static String getContentAsString(final URL inputURL, final String cookie) throws IOException {
        HttpURLConnection urlConn = null;
        try {
            urlConn = (HttpURLConnection) inputURL.openConnection();
            urlConn.setAllowUserInteraction(false);
            urlConn.setDoInput(true);
            urlConn.setDoOutput(false);
            urlConn.setUseCaches(false);

            /*
             * set user agent (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) since some 
             * pages require it to download content.
             */
            urlConn.setRequestProperty(USER_AGENT_HEADER_NAME, USER_AGENT_PROPERTY_VALUE);
            if (cookie != null) {
                urlConn.setRequestProperty(COOKIE_HEADER_NAME, cookie);
            }
            urlConn.connect();

            /*
             * extract character encoding from header
             */
            final String charSet = getCharset(urlConn);

            /*
             * FIXME: check content type header to ensure that we only read textual 
             * content (and not a PDF, radio stream or DVD image ...)
             */
            return inputStreamToStringBuilder(urlConn.getInputStream(), charSet).toString();
        } catch (final ConnectException cex) {
            log.debug("Could not get content for URL " + inputURL.toString() + " : " + cex.getMessage());
            throw new IOException(cex);
        } catch (final IOException ioe) {
            log.debug("Could not get content for URL " + inputURL.toString() + " : " + ioe.getMessage());
            throw ioe;
        } finally {
            // release the connection also when connecting or reading failed
            if (urlConn != null) {
                urlConn.disconnect();
            }
        }
    }

    /**
     * Reads from a URL and writes the content into a string. No cookie is
     * sent with the request. Delegates to
     * {@link #getContentAsString(URL, String)}.
     * 
     * @param inputURL the URL of the content.
     * @return String which holds the page content.
     * @throws IOException if the content could not be read.
     * 
     * @deprecated this method works on a plain {@link HttpURLConnection};
     *             {@link #getContentAsString(String)} sends the request with
     *             the shared {@link HttpClient}.
     */
    @Deprecated
    public static String getContentAsString(final URL inputURL) throws IOException {
        return getContentAsString(inputURL, null);
    }

    /**
     * Reads from a URL using the shared {@link HttpClient} and writes the
     * content into a string.
     * <p>
     * If <code>postData</code> is given, a POST request is sent, otherwise a
     * GET request which follows redirects. <code>postData</code> is split at
     * <code>&amp;</code> into entries and every entry at <code>=</code> into
     * name and value; entries which do not consist of exactly one name and
     * one value are skipped.
     * <p>
     * Every executed request is released again, also when it fails, such that
     * its connection is returned to the connection manager.
     * 
     * @param url - the URL of the content.
     * @param cookie - the value of the Cookie header. If not present, no Cookie header is sent.
     * @param postData - the form data of a POST request. If not present, a GET request is sent.
     * @param visitBefore - a URL which is requested (GET) before the actual
     *        request, e.g., to get cookies. If not present, no such request is sent.
     * 
     * @return String which holds the page content, or <code>null</code> if
     *         the response has no or an empty body.
     * 
     * @throws IOException if a request fails or the status of the response is not 200 (OK).
     */
    public static String getContentAsString(final String url, final String cookie, final String postData,
            final String visitBefore) throws IOException {
        if (present(visitBefore)) {
            /*
             * visit URL to get cookies if needed; the response itself is not
             * of interest, but the connection must be given back
             */
            final HttpMethod visit = new GetMethod(visitBefore);
            try {
                client.executeMethod(visit);
            } finally {
                visit.releaseConnection();
            }
        }

        final HttpMethod method;
        if (present(postData)) {
            /*
             * do a POST request
             */
            final List<NameValuePair> data = new ArrayList<NameValuePair>();

            for (final String s : postData.split(AMP_SIGN)) {
                final String[] p = s.split(EQUAL_SIGN);

                if (p.length != 2) {
                    // neither "name=value" nor parseable, skip it
                    continue;
                }

                data.add(new NameValuePair(p[0], p[1]));
            }

            final PostMethod post = new PostMethod(url);
            post.setRequestBody(data.toArray(new NameValuePair[data.size()]));
            method = post;
        } else {
            /*
             * do a GET request
             */
            method = new GetMethod(url);
            method.setFollowRedirects(true);
        }

        /*
         * set cookie
         */
        if (present(cookie)) {
            method.setRequestHeader(COOKIE_HEADER_NAME, cookie);
        }

        try {
            /*
             * do request
             */
            final int status = client.executeMethod(method);
            if (status != HttpStatus.SC_OK) {
                throw new IOException(url + " returns: " + status);
            }

            /*
             * FIXME: check content type header to ensure that we only read textual 
             * content (and not a PDF, radio stream or DVD image ...)
             */

            /*
             * collect response; both the body and the content type header
             * may be missing
             */
            final InputStream body = method.getResponseBodyAsStream();
            if (body == null) {
                return null;
            }
            final String contentType;
            if (method.getResponseHeader(CONTENT_TYPE_HEADER_NAME) == null) {
                // extractCharset falls back to the default charset
                contentType = null;
            } else {
                contentType = method.getResponseHeader(CONTENT_TYPE_HEADER_NAME).getValue();
            }
            final String string = inputStreamToStringBuilder(body, extractCharset(contentType)).toString();
            if (string.length() > 0) {
                return string;
            }

            return null;
        } finally {
            // give the connection back, also for non-OK responses and errors
            method.releaseConnection();
        }
    }

    /**
     * Reads from a URL and writes the content into a string. Sends a GET
     * request without cookie by delegating to
     * {@link #getContentAsString(String, String, String, String)}.
     * 
     * @param url - the URL of the content.
     * @return String which holds the page content, or <code>null</code> if the content is empty.
     * @throws IOException if the request fails or the status of the response is not 200 (OK).
     */
    public static String getContentAsString(final String url) throws IOException {
        return getContentAsString(url, null, null, null);
    }

    /**
     * Reads from a URL and writes the content into a string. Sends a GET
     * request with the given cookie by delegating to
     * {@link #getContentAsString(String, String, String, String)}.
     * 
     * @param url - the URL of the content.
     * @param cookie - the value of the Cookie header. If not present, no Cookie header is sent.
     * @return String which holds the page content, or <code>null</code> if the content is empty.
     * @throws IOException if the request fails or the status of the response is not 200 (OK).
     */
    public static String getContentAsString(final String url, final String cookie) throws IOException {
        return getContentAsString(url, cookie, null, null);
    }

    /**
     * Follows the redirects of the given URL and returns the URL the chain of
     * redirects ends at.
     * <p>
     * A request is sent to the URL and the <code>Location</code> header of
     * the response is inspected. If it contains a URL (absolute or relative
     * to the requested URL), this URL is requested next. This is done up to
     * {@value #MAX_REDIRECT_COUNT}-times until a response without (valid)
     * <code>Location</code> header is reached. Consequently, the given URL
     * itself is returned if it does not redirect at all.
     * <p>
     * A redirect which leaves HTTP(S) can not be followed; its target is
     * returned as it is.
     * 
     * @param url - the URL to start with.
     * @return - The last URL of the redirect chain; <code>null</code> if
     *         <code>url</code> is <code>null</code> or a request failed.
     */
    public static URL getRedirectUrl(final URL url) {
        try {
            URL internalUrl = url;
            URL previousUrl = null;
            int redirectCtr = 0;
            while (internalUrl != null && redirectCtr < MAX_REDIRECT_COUNT) {
                final String protocol = internalUrl.getProtocol();
                if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
                    /*
                     * not an HTTP connection, the cast below would fail
                     */
                    return internalUrl;
                }
                redirectCtr++;

                final HttpURLConnection urlConn = (HttpURLConnection) internalUrl.openConnection();
                try {
                    urlConn.setAllowUserInteraction(false);
                    urlConn.setDoInput(true);
                    urlConn.setDoOutput(false);
                    urlConn.setUseCaches(false);
                    // we want to see the redirect, not the page it leads to
                    urlConn.setInstanceFollowRedirects(false);

                    /*
                     * set user agent (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) since some 
                     * pages require it to download content.
                     */
                    urlConn.setRequestProperty(USER_AGENT_HEADER_NAME, USER_AGENT_PROPERTY_VALUE);

                    urlConn.connect();

                    // get URL to redirected resource
                    previousUrl = internalUrl;
                    final String location = urlConn.getHeaderField(LOCATION);
                    if (location == null || location.trim().length() == 0) {
                        // no redirect: previousUrl is the final page
                        internalUrl = null;
                    } else {
                        try {
                            /*
                             * the header may contain a relative reference,
                             * which is resolved against the requested URL
                             */
                            internalUrl = new URL(previousUrl, location.trim());
                        } catch (final MalformedURLException e) {
                            internalUrl = null;
                        }
                    }
                } finally {
                    // release the connection also when the request failed
                    urlConn.disconnect();
                }
            }

            return previousUrl;
        } catch (final IOException e) {
            log.warn("Could not follow redirects of URL " + url, e);
            return null;
        }
    }

    /**
     * Returns the cookies returned by the server on accessing the URL.
     * The returned string consists of the values of all
     * <code>Set-Cookie</code> headers of the response, in the form they were
     * sent by the server, separated by <code>;</code>. It is empty, if the
     * response contains no such header.
     * 
     * @param url - the URL to request.
     * @return The cookies as string, build by {@link #buildCookieString(List)}.
     * @throws IOException if the request fails.
     */
    public static String getCookies(final URL url) throws IOException {
        final HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
        try {
            urlConn.setAllowUserInteraction(false);
            urlConn.setDoInput(true);
            urlConn.setDoOutput(false);
            urlConn.setUseCaches(false);

            urlConn.setRequestProperty(USER_AGENT_HEADER_NAME, USER_AGENT_PROPERTY_VALUE);

            urlConn.connect();

            // null, if the server did not send any cookie
            final List<String> cookies = urlConn.getHeaderFields().get("Set-Cookie");
            return buildCookieString(cookies);
        } finally {
            // release the connection also when the request failed
            urlConn.disconnect();
        }
    }

    /**
     * Folds the given cookies into one string as used in the HTTP header:
     * the cookies are concatenated in the given order and separated by
     * <code>;</code>.
     * 
     * @param cookies - the cookies to join; may be <code>null</code>.
     * @return The joined cookies; the empty string, if there are no cookies.
     */
    public static String buildCookieString(final List<String> cookies) {
        if (cookies == null) {
            return "";
        }

        final StringBuilder joined = new StringBuilder();
        for (final String cookie : cookies) {
            // a separator is only needed once something has been written
            if (joined.length() > 0) {
                joined.append(";");
            }
            joined.append(cookie);
        }
        return joined.toString();
    }

    /**
     * Extracts the charset ID of a web page as returned by the server, i.e.,
     * passes the content type of the connection to
     * {@link #extractCharset(String)}.
     * 
     * @param urlConn - the connection whose content type is inspected.
     * @return The charset as returned by {@link #extractCharset(String)}.
     */
    private static String getCharset(final HttpURLConnection urlConn) {
        return extractCharset(urlConn.getContentType());
    }

    /**
     * Extracts the charset from the given string. The string should resemble
     * the content type header of an HTTP response. Valid examples are:
     * <ul>
     * <li>text/html; charset=utf-8; qs=1</li>
     * <li>text/html; Charset="ISO-8859-1"</li>
     * </ul>
     * The name of the parameter is matched case-insensitively and quotes
     * around the value are removed. The charset is returned in upper case.
     *
     * @param contentType - the value of the content type header; may be <code>null</code>.
     * @return - The charset; <code>UTF-8</code> if the given string does not
     *         contain a (non-empty) charset.
     */
    public static String extractCharset(final String contentType) {
        /*
         * this typically looks like that:
         * text/html; charset=utf-8; qs=1
         */
        if (present(contentType)) {
            final int charsetPosition = indexOfIgnoreCase(contentType, CHARSET);
            if (charsetPosition > -1) {
                /*
                 * cut this:
                 *                    |<--   -->|             
                 * text/html; charset=utf-8; qs=1
                 */
                String charSet = contentType.substring(charsetPosition + CHARSET.length());

                // get only charset
                final int charsetEnding = charSet.indexOf(SEMICOLON);
                if (charsetEnding > -1) {
                    /*
                     * cut this:
                     * |<->|             
                     * utf-8; qs=1
                     */
                    charSet = charSet.substring(0, charsetEnding);
                }
                charSet = charSet.trim();

                /*
                 * the value may be quoted: charset="utf-8"
                 */
                if (charSet.length() >= 2) {
                    final char first = charSet.charAt(0);
                    final char last = charSet.charAt(charSet.length() - 1);
                    if (first == last && (first == '"' || first == '\'')) {
                        charSet = charSet.substring(1, charSet.length() - 1).trim();
                    }
                }

                if (charSet.length() > 0) {
                    /*
                     * the default locale must not be used here: with a
                     * Turkish locale "iso-8859-1" would become "İSO-8859-1"
                     */
                    return charSet.toUpperCase(java.util.Locale.ROOT);
                }
            }
        }
        /*
         * default charset
         */
        return DEFAULT_CHARSET;
    }

    /**
     * Finds the first occurrence of needle in haystack, ignoring case. The
     * strings are compared in place (instead of searching in a lower-cased
     * copy) because changing the case may change the length of a string.
     * 
     * @param haystack - the string to search in.
     * @param needle - the string to search for.
     * @return The index of the first occurrence or -1.
     */
    private static int indexOfIgnoreCase(final String haystack, final String needle) {
        final int lastStart = haystack.length() - needle.length();
        for (int i = 0; i <= lastStart; i++) {
            if (haystack.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Copies the (textual) content of the stream line by line into a string
     * builder and closes the stream, also when reading fails.
     * <p>
     * Every line is terminated by {@link #NEWLINE}, no matter how it is
     * terminated in the stream (<code>\n</code>, <code>\r</code> or
     * <code>\r\n</code>). Copying stops before the first line which would let
     * the result reach {@link #MAX_CONTENT_LENGTH} characters; this line and
     * the remaining content are dropped. The limit is checked while a line is
     * read, such that a stream without line breaks can not fill the heap.
     * 
     * @param inputStream - the stream to read from.
     * @param charset - the charset of the stream; if <code>null</code> or
     *        blank, the default charset of the platform is used.
     * @return The content of the stream.
     * @throws IOException if the charset is not supported or reading fails.
     */
    private static StringBuilder inputStreamToStringBuilder(final InputStream inputStream, final String charset)
            throws IOException {
        try {
            final InputStreamReader in;
            /*
             * set charset
             */
            if (charset == null || charset.trim().equals("")) {
                in = new InputStreamReader(inputStream);
            } else {
                in = new InputStreamReader(inputStream, charset);
            }
            /*
             * use buffered reader (we always assume to have text)
             */
            final BufferedReader buf = new BufferedReader(in);
            final StringBuilder sb = new StringBuilder();
            // the characters of the line currently read
            final StringBuilder line = new StringBuilder();
            // true after a '\r': a directly following '\n' belongs to the same line break
            boolean skipLineFeed = false;
            int c;
            while ((c = buf.read()) >= 0) {
                if (skipLineFeed) {
                    skipLineFeed = false;
                    if (c == '\n') {
                        continue;
                    }
                }
                if (c == '\n' || c == '\r') {
                    skipLineFeed = (c == '\r');
                    // also an empty line does not fit, if the limit is reached
                    if (sb.length() + line.length() >= MAX_CONTENT_LENGTH) {
                        return sb;
                    }
                    sb.append(line).append(NEWLINE);
                    line.setLength(0);
                } else {
                    line.append((char) c);
                    /*
                     * the line does not fit anymore, no need to read (and
                     * buffer) the rest of it
                     */
                    if (sb.length() + line.length() >= MAX_CONTENT_LENGTH) {
                        return sb;
                    }
                }
            }
            // last line without line break
            if (line.length() > 0) {
                sb.append(line).append(NEWLINE);
            }

            return sb;
        } finally {
            inputStream.close();
        }
    }

    /**
     * Writes the given string UTF-8 encoded to the stream and flushes the
     * stream. The stream is not closed.
     * <p>
     * The string must be encoded explicitly:
     * {@link OutputStream#write(int)} writes only the eight low-order bits
     * of its argument, so writing the characters one by one would garble
     * every character above U+00FF. For ASCII content (e.g., form-encoded
     * data) the written bytes are the same as with a per-character copy.
     * 
     * @param s - the string to write.
     * @param outputStream - the stream to write to.
     * @throws IOException if writing fails.
     */
    private static void writeStringToStream(final String s, final OutputStream outputStream) throws IOException {
        outputStream.write(s.getBytes("UTF-8"));
        outputStream.flush();
    }
}