Java tutorial
/* * Copyright 2014 Bibliotheca Alexandrina. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.liveweb; import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.ConnectException; import java.net.SocketException; import java.net.SocketTimeoutException; import java.net.URL; import java.util.logging.Logger; import org.apache.commons.httpclient.ConnectTimeoutException; import org.apache.commons.httpclient.HostConfiguration; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.NoHttpResponseException; import org.apache.commons.httpclient.params.HttpClientParams; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.archive.io.arc.ARCRecord; import org.archive.wayback.core.Resource; import org.archive.wayback.exception.LiveDocumentNotAvailableException; import org.archive.wayback.exception.LiveWebCacheUnavailableException; import org.archive.wayback.exception.LiveWebTimeoutException; import org.archive.wayback.exception.ResourceNotAvailableException; import org.archive.wayback.resourcestore.resourcefile.ArcResource; import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; /** * This class fetches resource from live web. * It works with standard proxy server e.g. Squid. * * @author Mohamed Elsayed * @see LiveWebCache * @see ArcRemoteLiveWebCache */ public class StdRemoteLiveWebCache implements LiveWebCache { private static final Logger LOGGER = Logger.getLogger(StdRemoteLiveWebCache.class.getName()); protected MultiThreadedHttpConnectionManager connectionManager; protected HostConfiguration hostConfiguration; protected HttpClient httpClient; protected String requestPrefix; private CloseableHttpResponse response; private ArcResource ar; /** * StdRemoteLiveWebCache constructor initializes and configures connection objects. */ public StdRemoteLiveWebCache() { connectionManager = new MultiThreadedHttpConnectionManager(); hostConfiguration = new HostConfiguration(); HttpClientParams params = new HttpClientParams(); params.setParameter(HttpClientParams.RETRY_HANDLER, new NoRetryHandler()); httpClient = new HttpClient(params, connectionManager); httpClient.setHostConfiguration(hostConfiguration); } /** * Gets resource object from the live web. Configure timeout to 10 seconds. * * @param url to fetch from the live web. * @param maxCacheMS maximum age of resource to return - optionally honored * @param bUseOlder if true, return documents older than maxCacheMS if * a more recent copy is not available. * * @return Resource for url * * @throws LiveDocumentNotAvailableException if the resource cannot be * retrieved from the live web, but all proxying and caching * mechanisms functioned properly * @throws LiveWebCacheUnavailableException if there was a problem either * accessing the live web, in proxying to the live web, or in * maintaining the cache for the live web * @throws LiveWebTimeoutException if there is no response from the live * web cache before a timeout occurred. * @throws IOException for the usual reasons * * @see org.archive.wayback.liveweb.LiveWebCache#getCachedResource(java.net.URL, long, boolean) * @inheritDoc org.archive.wayback.liveweb.LiveWebCache#getCachedResource */ @Override public Resource getCachedResource(URL url, long maxCacheMS, boolean bUseOlder) throws LiveDocumentNotAvailableException, LiveWebCacheUnavailableException, LiveWebTimeoutException, IOException { String urlStr = url.toExternalForm(); if (requestPrefix != null) urlStr = requestPrefix + urlStr; HttpHost proxy = new HttpHost(hostConfiguration.getProxyHost(), hostConfiguration.getProxyPort()); // Set socketTimeout and connectionTimeout to 10 seconds. RequestConfig reqConf = RequestConfig.custom().setProxy(proxy).setSocketTimeout(10000) .setConnectTimeout(10000).setConnectionRequestTimeout(10000).build(); CloseableHttpClient httpclient = HttpClients.custom().setDefaultRequestConfig(reqConf).build(); HttpGet httpGet = new HttpGet(urlStr); try { // The following line gets robots.txt from live web response = httpclient.execute(httpGet); String httpHeaderStr = ""; String bodyStr = ""; /* If it fails to get robots.txt (http status code is 404), then display contents and don't throw exception (socketTimeOutException or connectTimeOutException) */ if (response.getStatusLine().getStatusCode() == 404) { httpHeaderStr = "HTTP/1.0 200 OK\n"; bodyStr = String.format("%s\n%s\n", "User-agent: *", "Allow: /"); } else if (response.getStatusLine().getStatusCode() == 200) { // The following line represents first line in http header httpHeaderStr = String.format("%s %d %s\n", response.getStatusLine().getProtocolVersion(), response.getStatusLine().getStatusCode(), response.getStatusLine().getReasonPhrase()); // Get robots.txt contents and store it into bodyStr HttpEntity entity = response.getEntity(); bodyStr = EntityUtils.toString(entity); } // Get Http Header and store complete http header in httpHeaderStr for (Header header : response.getAllHeaders()) httpHeaderStr += header.toString() + "\n"; httpHeaderStr += "\n"; int length = httpHeaderStr.length() + bodyStr.length(); /* Using httpHeaderStr and bodyStr to construct responseStr. First line in responseStr should exist. */ // TODO: the following line should be enhanced, // especially the first line in responseStr. String responseStr = String.format("%s %s %d\n%s%s", urlStr, "0.0.0.0 10000000000000 text/plain", length, httpHeaderStr, bodyStr); ByteArrayInputStream bais = new ByteArrayInputStream(responseStr.getBytes()); // TODO: Should not use ARCRecord ARCRecord r = new ARCRecord(bais, "id", 0L, false, false, true); ar = (ArcResource) ResourceFactory.ARCArchiveRecordToResource(r, null); if (ar.getStatusCode() == 502) { throw new LiveDocumentNotAvailableException(urlStr); } else if (ar.getStatusCode() == 504) { throw new LiveWebTimeoutException("Timeout:" + urlStr); } return ar; } catch (ResourceNotAvailableException e) { throw new LiveDocumentNotAvailableException(urlStr); } catch (NoHttpResponseException e) { throw new LiveWebCacheUnavailableException("No Http Response for " + urlStr); } catch (ConnectException e) { throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() + " : " + urlStr); } catch (SocketException e) { throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() + " : " + urlStr); } catch (SocketTimeoutException e) { throw new LiveWebTimeoutException(e.getLocalizedMessage() + " : " + urlStr); } catch (ConnectTimeoutException e) { throw new LiveWebTimeoutException(e.getLocalizedMessage() + " : " + urlStr); } finally { response.close(); } } /** * Sets proxy and port (proxy:port). * * @param hostPort to proxy requests through - ex. "localhost:3128" */ public void setProxyHostPort(String hostPort) { int colonIdx = hostPort.indexOf(':'); if (colonIdx > 0) { String host = hostPort.substring(0, colonIdx); int port = Integer.valueOf(hostPort.substring(colonIdx + 1)); hostConfiguration.setProxy(host, port); } } /** * * @see org.archive.wayback.liveweb.LiveWebCache#shutdown() */ @Override public void shutdown() { throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. } }