Java tutorial
/* * Tanaguru - Automated webpage assessment * Copyright (C) 2008-2015 Tanaguru.org * * This file is part of Tanaguru. * * Tanaguru is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * * Contact us by mail: tanaguru AT tanaguru DOT org */ package org.tanaguru.util.http; import java.io.IOException; import java.net.URISyntaxException; import java.net.UnknownHostException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.util.URIUtil; import org.apache.commons.lang3.StringUtils; import org.apache.http.Header; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.CredentialsProvider; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpHead; import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.util.EntityUtils; import org.apache.log4j.Logger; /** * * @author jkowalczyk */ public class HttpRequestHandler { private static final String TANAGURU_USER_AGENT = "tanaguru"; private static final Logger LOGGER = Logger.getLogger(HttpRequestHandler.class); private String proxyPort; public void setProxyPort(String proxyPort) { this.proxyPort = proxyPort; } private String proxyHost; public void setProxyHost(String proxyHost) { this.proxyHost = proxyHost; } private String proxyUser; public void setProxyUser(String proxyUser) { this.proxyUser = proxyUser; } private String proxyPassword; public void setProxyPassword(String proxyPassword) { this.proxyPassword = proxyPassword; } private boolean bypassCheck = false; public void setBypassCheck(String bypassCheck) { this.bypassCheck = Boolean.valueOf(bypassCheck); } /** * Multiple Url can be set through a unique String separated by ; */ private final List<String> proxyExclusionUrlList = new ArrayList<>(); public List<String> getProxyExclusionUrlList() { return proxyExclusionUrlList; } public void setProxyExclusionUrl(String proxyExclusionUrl) { if (StringUtils.isNotBlank(proxyExclusionUrl.trim())) { proxyExclusionUrlList.addAll(Arrays.asList(proxyExclusionUrl.split(";"))); } } private int connectionTimeout = 3000; public void setConnectionTimeout(int connectionTimeout) { this.connectionTimeout = connectionTimeout; } private int socketTimeout = 3000; public void setSocketTimeout(int socketTimeout) { this.socketTimeout = socketTimeout; } /** * The holder that handles the unique instance of HttpRequestHandler */ private static class HttpRequestHandlerHolder { private static final HttpRequestHandler INSTANCE = new HttpRequestHandler(); } /** * Private constructor */ private HttpRequestHandler() { } /** * Singleton pattern based on the "Initialization-on-demand * holder idiom". See @http://en.wikipedia.org/wiki/Initialization_on_demand_holder_idiom * @return the unique instance of HttpRequestHandler */ public static HttpRequestHandler getInstance() { return HttpRequestHandlerHolder.INSTANCE; } /** * * @param url * @return whether the given Url is accessible or not */ public boolean isUrlAccessible(String url) { if (bypassCheck) { LOGGER.debug("check on Url is bypassed by configuration"); return true; } try { int statusFromHead = computeStatus(getHttpStatus(url)); switch (statusFromHead) { case 1: return true; case 0: int statusFromGet = computeStatus(getHttpStatusFromGet(url)); switch (statusFromGet) { case 0: return false; case 1: return true; } } return false; } catch (IOException ex) { LOGGER.debug(ex.getMessage()); LOGGER.debug("IOException on " + url); return false; } } public int getHttpStatus(String url) throws IOException { String encodedUrl = getEncodedUrl(url); CloseableHttpClient httpClient = getHttpClient(encodedUrl); HttpHead head = new HttpHead(encodedUrl); try { LOGGER.debug("executing head request to retrieve page status on " + head.getURI()); HttpResponse response = httpClient.execute(head); if (LOGGER.isDebugEnabled()) { LOGGER.debug("received " + response.getStatusLine().getStatusCode() + " from head request"); for (Header h : head.getAllHeaders()) { LOGGER.debug("header : " + h.getName() + " " + h.getValue()); } } return response.getStatusLine().getStatusCode(); } catch (UnknownHostException uhe) { LOGGER.warn("UnknownHostException on " + encodedUrl); return HttpStatus.SC_NOT_FOUND; } catch (IllegalArgumentException iae) { LOGGER.warn("IllegalArgumentException on " + encodedUrl); return HttpStatus.SC_NOT_FOUND; } catch (IOException ioe) { LOGGER.warn("IOException on " + encodedUrl); return HttpStatus.SC_NOT_FOUND; } finally { // When HttpClient instance is no longer needed, // shut down the connection manager to ensure // immediate deallocation of all system resources head.releaseConnection(); httpClient.close(); } } public String getHttpContent(String url) throws URISyntaxException, UnknownHostException, IOException, IllegalCharsetNameException { if (StringUtils.isEmpty(url)) { return ""; } String encodedUrl = getEncodedUrl(url); CloseableHttpClient httpClient = getHttpClient(encodedUrl); HttpGet get = new HttpGet(encodedUrl); try { LOGGER.debug("executing request to retrieve content on " + get.getURI()); HttpResponse response = httpClient.execute(get); LOGGER.debug("received " + response.getStatusLine().getStatusCode() + " from get request"); if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { LOGGER.debug("status == HttpStatus.SC_OK "); return EntityUtils.toString(response.getEntity(), Charset.defaultCharset()); } else { LOGGER.debug("status != HttpStatus.SC_OK "); return ""; } } catch (NullPointerException ioe) { LOGGER.debug("NullPointerException"); return ""; } finally { // When HttpClient instance is no longer needed, // shut down the connection manager to ensure // immediate deallocation of all system resources get.releaseConnection(); LOGGER.debug("finally"); httpClient.close(); } } public int getHttpStatusFromGet(String url) throws IOException { String encodedUrl = getEncodedUrl(url); CloseableHttpClient httpClient = getHttpClient(encodedUrl); HttpGet get = new HttpGet(encodedUrl); try { LOGGER.debug("executing get request to retrieve status on " + get.getURI()); HttpResponse status = httpClient.execute(get); if (LOGGER.isDebugEnabled()) { LOGGER.debug("received " + status + " from get request"); for (Header h : get.getAllHeaders()) { LOGGER.debug("header : " + h.getName() + " " + h.getValue()); } } return status.getStatusLine().getStatusCode(); } catch (UnknownHostException uhe) { LOGGER.warn("UnknownHostException on " + encodedUrl); return HttpStatus.SC_NOT_FOUND; } catch (IOException ioe) { LOGGER.warn("IOException on " + encodedUrl); return HttpStatus.SC_NOT_FOUND; } finally { // When HttpClient instance is no longer needed, // shut down the connection manager to ensure // immediate deallocation of all system resources get.releaseConnection(); httpClient.close(); } } private CloseableHttpClient getHttpClient(String url) { RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(socketTimeout) .setConnectTimeout(connectionTimeout).build(); HttpClientBuilder httpClientBuilder = HttpClientBuilder.create(); httpClientBuilder.setDefaultRequestConfig(requestConfig); httpClientBuilder.setConnectionManager(new PoolingHttpClientConnectionManager()); httpClientBuilder.setUserAgent(TANAGURU_USER_AGENT); if (isProxySet(url)) { LOGGER.debug(("Set proxy with " + proxyHost + " and " + proxyPort)); httpClientBuilder.setProxy(new HttpHost(proxyHost, Integer.valueOf(proxyPort))); if (isProxyCredentialSet()) { CredentialsProvider credsProvider = new BasicCredentialsProvider(); credsProvider.setCredentials(new AuthScope(proxyHost, Integer.valueOf(proxyPort)), new UsernamePasswordCredentials(proxyUser, proxyPassword)); httpClientBuilder.setDefaultCredentialsProvider(credsProvider); LOGGER.debug(("Set proxy credentials " + proxyHost + " and " + proxyPort + " and " + proxyUser + " and " + proxyPassword)); } } return httpClientBuilder.build(); } /** * * @param url * @return */ public boolean isProxySet(String url) { for (String excludedUrl : proxyExclusionUrlList) { if (url.contains(excludedUrl) && StringUtils.isNotBlank(excludedUrl)) { LOGGER.debug("Proxy Not Set due to exclusion with : " + excludedUrl); return false; } } LOGGER.debug("isProxySet: " + (StringUtils.isNotEmpty(proxyHost) && StringUtils.isNotEmpty(proxyPort))); return StringUtils.isNotEmpty(proxyHost) && StringUtils.isNotEmpty(proxyPort); } /** * * @param url * @return */ private boolean isProxyCredentialSet() { LOGGER.debug("isProxyCredentialSet" + (StringUtils.isNotEmpty(proxyUser) && StringUtils.isNotEmpty(proxyPassword))); return StringUtils.isNotEmpty(proxyUser) && StringUtils.isNotEmpty(proxyPassword); } private int computeStatus(int status) { switch (status) { case HttpStatus.SC_FORBIDDEN: case HttpStatus.SC_METHOD_NOT_ALLOWED: case HttpStatus.SC_BAD_REQUEST: case HttpStatus.SC_UNAUTHORIZED: case HttpStatus.SC_PAYMENT_REQUIRED: case HttpStatus.SC_NOT_FOUND: case HttpStatus.SC_NOT_ACCEPTABLE: case HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED: case HttpStatus.SC_REQUEST_TIMEOUT: case HttpStatus.SC_CONFLICT: case HttpStatus.SC_GONE: case HttpStatus.SC_LENGTH_REQUIRED: case HttpStatus.SC_PRECONDITION_FAILED: case HttpStatus.SC_REQUEST_TOO_LONG: case HttpStatus.SC_REQUEST_URI_TOO_LONG: case HttpStatus.SC_UNSUPPORTED_MEDIA_TYPE: case HttpStatus.SC_REQUESTED_RANGE_NOT_SATISFIABLE: case HttpStatus.SC_EXPECTATION_FAILED: case HttpStatus.SC_INSUFFICIENT_SPACE_ON_RESOURCE: case HttpStatus.SC_METHOD_FAILURE: case HttpStatus.SC_UNPROCESSABLE_ENTITY: case HttpStatus.SC_LOCKED: case HttpStatus.SC_FAILED_DEPENDENCY: case HttpStatus.SC_INTERNAL_SERVER_ERROR: case HttpStatus.SC_NOT_IMPLEMENTED: case HttpStatus.SC_BAD_GATEWAY: case HttpStatus.SC_SERVICE_UNAVAILABLE: case HttpStatus.SC_GATEWAY_TIMEOUT: case HttpStatus.SC_HTTP_VERSION_NOT_SUPPORTED: case HttpStatus.SC_INSUFFICIENT_STORAGE: return 0; case HttpStatus.SC_CONTINUE: case HttpStatus.SC_SWITCHING_PROTOCOLS: case HttpStatus.SC_PROCESSING: case HttpStatus.SC_OK: case HttpStatus.SC_CREATED: case HttpStatus.SC_ACCEPTED: case HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION: case HttpStatus.SC_NO_CONTENT: case HttpStatus.SC_RESET_CONTENT: case HttpStatus.SC_PARTIAL_CONTENT: case HttpStatus.SC_MULTI_STATUS: case HttpStatus.SC_MULTIPLE_CHOICES: case HttpStatus.SC_MOVED_PERMANENTLY: case HttpStatus.SC_MOVED_TEMPORARILY: case HttpStatus.SC_SEE_OTHER: case HttpStatus.SC_NOT_MODIFIED: case HttpStatus.SC_USE_PROXY: case HttpStatus.SC_TEMPORARY_REDIRECT: return 1; default: return 1; } } private String getEncodedUrl(String url) { try { return URIUtil.encodeQuery(URIUtil.decode(url)); } catch (URIException ue) { LOGGER.warn("URIException on " + url); return url; } } }