Java tutorial
/*
 * Copyright 2015 Crosstree Labs.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.crosstreelabs.cognitio.gumshoe.transport;

import com.crosstreelabs.cognitio.api.extension.TransportHandler;
import com.crosstreelabs.cognitio.api.resource.Host;
import com.crosstreelabs.cognitio.api.resource.Status;
import com.crosstreelabs.cognitio.api.resource.Visit;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.Timer;
import java.util.TimerTask;
import javax.net.ssl.SSLContext;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContexts;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link TransportHandler} that fetches {@code http://} and {@code https://}
 * locations with a pooled Apache HttpClient.
 *
 * <p>Redirects are not followed automatically (the request configuration
 * disables them); instead the redirect status is translated into a catalogue
 * entry status. A {@code 200 OK} body is buffered fully in memory and exposed
 * through {@code visit.contentStream}.
 *
 * <p><strong>Security note:</strong> the HTTPS socket factory built by this
 * class trusts every certificate chain and skips hostname verification.
 */
public class HttpTransport implements TransportHandler {
    private static final Logger LOGGER = LoggerFactory.getLogger(HttpTransport.class);

    /** Socket, connect and connection-request timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;
    /** Upper bound on pooled connections, both in total and per route. */
    private static final int MAX_CONNECTIONS = 5;
    /** Value sent in the {@code User-Agent} request header. */
    private static final String USER_AGENT = "Cognitio";
    /** HTTP 308 (Permanent Redirect); written as a literal, as in the original code. */
    private static final int SC_PERMANENT_REDIRECT = 308;

    private HttpClient httpClient;
    private PoolingHttpClientConnectionManager connectionManager;
    private RequestConfig requestConfig;

    /** Creates the transport and eagerly builds its HTTP client. */
    public HttpTransport() {
        buildHttpClient();
    }

    /**
     * Tells whether this transport can fetch the given visit.
     *
     * @param visit the visit to inspect; may be {@code null}
     * @return {@code true} when the visit has a catalogue entry whose location
     *         is non-blank and starts with {@code http://} or {@code https://}
     */
    @Override
    public boolean handles(final Visit visit) {
        if (visit == null
                || visit.catalogueEntry == null
                || StringUtils.isBlank(visit.catalogueEntry.location)) {
            return false;
        }
        final String location = visit.catalogueEntry.location;
        return location.startsWith("http://") || location.startsWith("https://");
    }

    /**
     * Issues a GET for {@code visit.result.location} and records the outcome on
     * the visit.
     *
     * <ul>
     *   <li>301 / 308: the catalogue entry is marked {@link Status#DISABLED}.</li>
     *   <li>300 / 302 / 303 / 307: the catalogue entry is marked
     *       {@link Status#INDEXED}.</li>
     *   <li>200: the fetched URL, the buffered body and the content headers are
     *       stored on the visit.</li>
     *   <li>anything else: the catalogue entry is marked {@link Status#FAILED}
     *       with a reason naming the received status code.</li>
     * </ul>
     *
     * <p>NOTE(review): {@link #handles(Visit)} validates
     * {@code visit.catalogueEntry.location}, whereas the request is built from
     * {@code visit.result.location}; this assumes callers populate
     * {@code visit.result} beforehand - confirm against the caller.
     *
     * @param visit the visit to fetch
     * @throws UnsupportedOperationException if {@link #handles(Visit)} rejects the visit
     * @throws IOException if executing the request or reading the body fails
     */
    @Override
    public void handle(final Visit visit) throws IOException {
        if (!handles(visit)) {
            throw new UnsupportedOperationException("Resource not supported");
        }

        if (visit.result.host == null) {
            visit.result.host = new Host();
            visit.result.host.host = URI.create(visit.result.location).getHost();
        }
        visit.result.host.lastRequest = new DateTime();

        final HttpGet request = new HttpGet(visit.result.location);
        request.setConfig(requestConfig);
        request.addHeader("Connection", "close");

        try {
            // Executed inside the try so the request is aborted even when
            // execute() itself fails.
            final HttpResponse response = httpClient.execute(request);
            final int statusCode = response.getStatusLine().getStatusCode();

            switch (statusCode) {
                case HttpStatus.SC_MOVED_PERMANENTLY:
                case SC_PERMANENT_REDIRECT:
                    // TODO: the redirect target (Location header) is not recorded yet.
                    visit.catalogueEntry.status = Status.DISABLED;
                    break;

                case HttpStatus.SC_MOVED_TEMPORARILY:
                case HttpStatus.SC_MULTIPLE_CHOICES:
                case HttpStatus.SC_SEE_OTHER:
                case HttpStatus.SC_TEMPORARY_REDIRECT:
                    // TODO: the redirect target (Location header) is not recorded yet.
                    visit.catalogueEntry.status = Status.INDEXED;
                    break;

                case HttpStatus.SC_OK:
                    recordSuccess(visit, request, response);
                    break;

                default:
                    visit.catalogueEntry.status = Status.FAILED;
                    // Report the code actually received; visit.statusCode is
                    // never assigned by this method.
                    visit.catalogueEntry.status_reason = "Unexpected status code " + statusCode;
                    break;
            }
        } finally {
            // Releases the underlying connection regardless of the outcome.
            request.abort();
        }
    }

    /** Copies the URL, body and content headers of a 200 response onto the visit. */
    private static void recordSuccess(final Visit visit,
                                      final HttpGet request,
                                      final HttpResponse response) throws IOException {
        visit.fetchedUrl = request.getURI().toString();
        visit.contentStream = new ByteArrayInputStream(readBody(response));

        final String contentType = headerValue(response, "Content-Type");
        if (contentType != null) {
            visit.contentType = contentType;
        }
        final String contentEncoding = headerValue(response, "Content-Encoding");
        if (contentEncoding != null) {
            visit.contentEncoding = contentEncoding;
        }
        // NOTE(review): "Content-Charset" is not a standard HTTP header; the
        // charset normally travels as a Content-Type parameter. Kept as-is.
        final String contentCharset = headerValue(response, "Content-Charset");
        if (contentCharset != null) {
            visit.contentCharset = contentCharset;
        }
    }

    /** Reads the whole response body into memory; an absent entity yields an empty array. */
    private static byte[] readBody(final HttpResponse response) throws IOException {
        final HttpEntity entity = response.getEntity();
        if (entity == null) {
            return new byte[0];
        }
        // EntityUtils closes the content stream once it has been drained.
        final byte[] body = EntityUtils.toByteArray(entity);
        return body == null ? new byte[0] : body;
    }

    /** Returns the value (not the full header line) of the first header with that name, or null. */
    private static String headerValue(final HttpResponse response, final String name) {
        final Header header = response.getFirstHeader(name);
        return header == null ? null : header.getValue();
    }

    /**
     * Builds the request configuration, the pooled connection manager and the
     * HTTP client used by {@link #handle(Visit)}.
     *
     * <p>If the HTTPS socket factory cannot be created, a warning is logged and
     * only the {@code http} scheme is registered.
     */
    private void buildHttpClient() {
        requestConfig = RequestConfig.custom()
                .setExpectContinueEnabled(false)
                .setCookieSpec(CookieSpecs.DEFAULT)
                .setRedirectsEnabled(false)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                .setStaleConnectionCheckEnabled(true)
                .build();

        final RegistryBuilder<ConnectionSocketFactory> connRegistryBuilder = RegistryBuilder.create();
        connRegistryBuilder.register("http", PlainConnectionSocketFactory.INSTANCE);
        try {
            // Fixing: https://code.google.com/p/crawler4j/issues/detail?id=174
            // By always trusting the ssl certificate.
            // SECURITY: this disables certificate and hostname validation for
            // every HTTPS request made through this transport.
            final SSLContext sslContext = SSLContexts.custom()
                    .loadTrustMaterial(null, new TrustStrategy() {
                        @Override
                        public boolean isTrusted(final X509Certificate[] chain, String authType) {
                            return true;
                        }
                    })
                    .build();
            final SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(
                    sslContext, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            connRegistryBuilder.register("https", sslsf);
        } catch (KeyManagementException | KeyStoreException | NoSuchAlgorithmException e) {
            LOGGER.warn("Exception thrown while trying to register https");
            LOGGER.debug("Stacktrace", e);
        }

        final Registry<ConnectionSocketFactory> connRegistry = connRegistryBuilder.build();
        connectionManager = new PoolingHttpClientConnectionManager(connRegistry);
        connectionManager.setMaxTotal(MAX_CONNECTIONS);
        connectionManager.setDefaultMaxPerRoute(MAX_CONNECTIONS);

        final HttpClientBuilder clientBuilder = HttpClientBuilder.create();
        clientBuilder.setDefaultRequestConfig(requestConfig);
        clientBuilder.setConnectionManager(connectionManager);
        clientBuilder.setUserAgent(USER_AGENT);
        httpClient = clientBuilder.build();
    }
}