org.archive.wayback.liveweb.StdRemoteLiveWebCache.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.wayback.liveweb.StdRemoteLiveWebCache.java

Source

/*
 * Copyright 2014 Bibliotheca Alexandrina.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.archive.wayback.liveweb;

import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.ConnectException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.ConnectTimeoutException;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NoHttpResponseException;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.archive.io.arc.ARCRecord;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
import org.archive.wayback.exception.LiveWebCacheUnavailableException;
import org.archive.wayback.exception.LiveWebTimeoutException;
import org.archive.wayback.exception.ResourceNotAvailableException;
import org.archive.wayback.resourcestore.resourcefile.ArcResource;
import org.archive.wayback.resourcestore.resourcefile.ResourceFactory;

/**
 * Fetches a resource from the live web through a standard HTTP proxy server
 * (e.g. Squid) and wraps the response in a synthetic ARC record so that it can
 * be returned as a {@link Resource}.
 *
 * <p>{@link #getCachedResource(URL, long, boolean)} keeps all per-request state
 * in local variables; the only state shared between calls is the proxy
 * configuration set through {@link #setProxyHostPort(String)}.
 *
 * @author Mohamed Elsayed
 * @see LiveWebCache
 * @see ArcRemoteLiveWebCache
 */
public class StdRemoteLiveWebCache implements LiveWebCache {
    private static final Logger LOGGER = Logger.getLogger(StdRemoteLiveWebCache.class.getName());

    /** Socket, connect and connection-request timeout, in milliseconds. */
    private static final int TIMEOUT_MS = 10000;

    /** Charset used to serialize the synthetic ARC record. */
    private static final String RECORD_CHARSET = "UTF-8";

    /** Placeholder IP address, timestamp and MIME type of the ARC metadata line. */
    private static final String ARC_METADATA = "0.0.0.0 10000000000000 text/plain";

    protected MultiThreadedHttpConnectionManager connectionManager;
    protected HostConfiguration hostConfiguration;
    protected HttpClient httpClient;
    protected String requestPrefix;

    /**
     * StdRemoteLiveWebCache constructor initializes and configures connection objects.
     */
    public StdRemoteLiveWebCache() {
        connectionManager = new MultiThreadedHttpConnectionManager();
        hostConfiguration = new HostConfiguration();
        HttpClientParams params = new HttpClientParams();
        params.setParameter(HttpClientParams.RETRY_HANDLER, new NoRetryHandler());
        httpClient = new HttpClient(params, connectionManager);
        httpClient.setHostConfiguration(hostConfiguration);
    }

    /**
     * Gets a resource object from the live web through the configured proxy.
     * Socket, connect and connection-request timeouts are all 10 seconds.
     *
     * <p>A 404 response is replaced by a synthetic {@code 200} response whose
     * body is an allow-all robots.txt; the headers of the original response
     * are still appended to it. Any other status is passed through with its
     * real status line, headers and body.
     *
     * @param url to fetch from the live web.
     * @param maxCacheMS maximum age of resource to return - not used by this
     *                   implementation
     * @param bUseOlder if true, return documents older than maxCacheMS if
     *                  a more recent copy is not available - not used by this
     *                  implementation
     *
     * @return Resource for url
     *
     * @throws LiveDocumentNotAvailableException if the proxy answers with 502
     *         or the response cannot be turned into a resource
     * @throws LiveWebCacheUnavailableException if no proxy is configured, the
     *         connection fails, or no HTTP response is received
     * @throws LiveWebTimeoutException if the proxy answers with 504 or the
     *         request times out
     * @throws IOException for the usual reasons
     *
     * @see org.archive.wayback.liveweb.LiveWebCache#getCachedResource(java.net.URL, long, boolean)
     */
    @Override
    public Resource getCachedResource(URL url, long maxCacheMS, boolean bUseOlder)
            throws LiveDocumentNotAvailableException, LiveWebCacheUnavailableException, LiveWebTimeoutException,
            IOException {
        String urlStr = url.toExternalForm();
        if (requestPrefix != null) {
            urlStr = requestPrefix + urlStr;
        }

        // Without this check a missing proxy surfaces as an obscure
        // IllegalArgumentException from the HttpHost constructor.
        String proxyHost = hostConfiguration.getProxyHost();
        if (proxyHost == null) {
            throw new LiveWebCacheUnavailableException("No proxy configured : " + urlStr);
        }
        HttpHost proxy = new HttpHost(proxyHost, hostConfiguration.getProxyPort());

        RequestConfig reqConf = RequestConfig.custom().setProxy(proxy).setSocketTimeout(TIMEOUT_MS)
                .setConnectTimeout(TIMEOUT_MS).setConnectionRequestTimeout(TIMEOUT_MS).build();

        // Built before the client so that an invalid URI cannot leak a client.
        HttpGet httpGet = new HttpGet(urlStr);
        CloseableHttpClient client = HttpClients.custom().setDefaultRequestConfig(reqConf).build();
        CloseableHttpResponse response = null;

        try {
            response = client.execute(httpGet);
            int statusCode = response.getStatusLine().getStatusCode();

            // Gateway errors reported by the proxy are mapped before any
            // record is built, so they do not depend on ARC parsing.
            if (statusCode == 502) {
                throw new LiveDocumentNotAvailableException(urlStr);
            } else if (statusCode == 504) {
                throw new LiveWebTimeoutException("Timeout:" + urlStr);
            }

            StringBuilder httpHeader = new StringBuilder();
            String bodyStr = "";

            if (statusCode == 404) {
                // A missing robots.txt is presented as an allow-all document.
                httpHeader.append("HTTP/1.0 200 OK\n");
                bodyStr = String.format("%s\n%s\n", "User-agent: *", "Allow: /");
            } else {
                // The status line is always written; a record without one
                // cannot be parsed into a meaningful status code.
                httpHeader.append(String.format("%s %d %s\n", response.getStatusLine().getProtocolVersion(),
                        statusCode, response.getStatusLine().getReasonPhrase()));

                // Responses such as 204 or 304 carry no entity.
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    bodyStr = EntityUtils.toString(entity);
                }
            }

            for (Header header : response.getAllHeaders()) {
                httpHeader.append(header.toString()).append('\n');
            }
            httpHeader.append('\n');

            return toResource(urlStr, httpHeader.toString(), bodyStr);
        } catch (ResourceNotAvailableException e) {
            throw new LiveDocumentNotAvailableException(urlStr);
        } catch (NoHttpResponseException e) {
            throw new LiveWebCacheUnavailableException("No Http Response for " + urlStr);
        } catch (org.apache.http.NoHttpResponseException e) {
            // Variant thrown by the HttpClient 4.x stack used for the request.
            throw new LiveWebCacheUnavailableException("No Http Response for " + urlStr);
        } catch (ConnectException e) {
            throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() + " : " + urlStr);
        } catch (SocketException e) {
            throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() + " : " + urlStr);
        } catch (SocketTimeoutException e) {
            throw new LiveWebTimeoutException(e.getLocalizedMessage() + " : " + urlStr);
        } catch (ConnectTimeoutException e) {
            throw new LiveWebTimeoutException(e.getLocalizedMessage() + " : " + urlStr);
        } catch (InterruptedIOException e) {
            // HttpClient 4.x reports connect and connection-pool timeouts as
            // subclasses of InterruptedIOException.
            throw new LiveWebTimeoutException(e.getLocalizedMessage() + " : " + urlStr);
        } finally {
            closeQuietly(response);
            closeQuietly(client);
        }
    }

    /**
     * Wraps an HTTP header block and body in a single ARC record and converts it to a resource.
     */
    private static Resource toResource(String urlStr, String httpHeaderStr, String bodyStr)
            throws IOException, ResourceNotAvailableException {
        // The ARC length field counts bytes, so it is measured on the encoded payload.
        byte[] payload = (httpHeaderStr + bodyStr).getBytes(RECORD_CHARSET);

        // TODO: the metadata line is still made of placeholder values.
        byte[] metadata = String.format("%s %s %d\n", urlStr, ARC_METADATA, payload.length)
                .getBytes(RECORD_CHARSET);

        byte[] recordBytes = new byte[metadata.length + payload.length];
        System.arraycopy(metadata, 0, recordBytes, 0, metadata.length);
        System.arraycopy(payload, 0, recordBytes, metadata.length, payload.length);

        // TODO: Should not use ARCRecord
        ARCRecord record = new ARCRecord(new ByteArrayInputStream(recordBytes), "id", 0L, false, false, true);
        return ResourceFactory.ARCArchiveRecordToResource(record, null);
    }

    /**
     * Closes the given resource if it is non-null, logging instead of propagating a failure.
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            // A failure to close must not mask the outcome of the request.
            LOGGER.log(Level.FINE, "Failed to close " + resource, e);
        }
    }

    /**
     * Sets proxy and port (proxy:port). A null value, or one without a host
     * part followed by a colon, is logged and ignored.
     *
     * @param hostPort to proxy requests through - ex. "localhost:3128"
     * @throws NumberFormatException if the text after the colon is not an integer
     */
    public void setProxyHostPort(String hostPort) {
        int colonIdx = (hostPort == null) ? -1 : hostPort.indexOf(':');
        if (colonIdx <= 0) {
            LOGGER.warning("Ignoring proxy setting, expected host:port but got: " + hostPort);
            return;
        }
        String host = hostPort.substring(0, colonIdx);
        int port = Integer.parseInt(hostPort.substring(colonIdx + 1));
        hostConfiguration.setProxy(host, port);
    }

    /**
     * Shuts down the connection manager created by the constructor.
     *
     * @see org.archive.wayback.liveweb.LiveWebCache#shutdown()
     */
    @Override
    public void shutdown() {
        if (connectionManager != null) {
            connectionManager.shutdown();
        }
    }
}