com.crosstreelabs.cognitio.gumshoe.transport.HttpTransport.java Source code

Java tutorial

Introduction

Here is the source code for com.crosstreelabs.cognitio.gumshoe.transport.HttpTransport.java

Source

/*
 * Copyright 2015 Crosstree Labs.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.crosstreelabs.cognitio.gumshoe.transport;

import com.crosstreelabs.cognitio.api.extension.TransportHandler;
import com.crosstreelabs.cognitio.api.resource.Host;
import com.crosstreelabs.cognitio.api.resource.Status;
import com.crosstreelabs.cognitio.api.resource.Visit;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.Timer;
import java.util.TimerTask;
import javax.net.ssl.SSLContext;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContexts;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link TransportHandler} that fetches {@code http://} and {@code https://} resources using
 * Apache HttpClient.
 *
 * <p>The client is configured with redirects disabled, so redirect responses are reported through
 * the catalogue entry status instead of being followed. Successful responses are buffered fully
 * in memory and exposed through {@code visit.contentStream}.
 *
 * <p><strong>Security note:</strong> the HTTPS socket factory built here trusts every certificate
 * chain and skips hostname verification. Responses fetched over HTTPS are therefore not
 * authenticated and must be treated as untrusted.
 */
public class HttpTransport implements TransportHandler {

    private static final Logger LOGGER = LoggerFactory.getLogger(HttpTransport.class);

    /** HTTP status code 308 (Permanent Redirect). */
    private static final int SC_PERMANENT_REDIRECT = 308;
    /** Socket, connect and connection-request timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;
    /** Upper bound on pooled connections, both in total and per route. */
    private static final int MAX_CONNECTIONS = 5;
    /** User-Agent header value sent with every request. */
    private static final String USER_AGENT = "Cognitio";

    private HttpClient httpClient;
    private PoolingHttpClientConnectionManager connectionManager;
    private RequestConfig requestConfig;

    /** Creates a transport with a freshly built, pooled HTTP client. */
    public HttpTransport() {
        buildHttpClient();
    }

    /**
     * Tells whether this transport can fetch the given visit.
     *
     * @param visit the visit to inspect; may be {@code null}
     * @return {@code true} when the catalogue entry location is non-blank and starts with
     *         {@code http://} or {@code https://} (case-sensitive); {@code false} otherwise
     */
    @Override
    public boolean handles(final Visit visit) {
        if (visit == null || visit.catalogueEntry == null || StringUtils.isBlank(visit.catalogueEntry.location)) {
            return false;
        }
        return visit.catalogueEntry.location.startsWith("http://")
                || visit.catalogueEntry.location.startsWith("https://");
    }

    /**
     * Issues a GET request for {@code visit.result.location} and records the outcome on the visit.
     *
     * <ul>
     *   <li>301 / 308: the catalogue entry is marked {@link Status#DISABLED}.</li>
     *   <li>300 / 302 / 303 / 307: the catalogue entry is marked {@link Status#INDEXED}.</li>
     *   <li>200: the body is buffered into {@code visit.contentStream} and the content headers
     *       are copied onto the visit.</li>
     *   <li>anything else: the catalogue entry is marked {@link Status#FAILED} with a reason
     *       naming the received status code.</li>
     * </ul>
     *
     * <p>NOTE(review): {@link #handles(Visit)} validates {@code catalogueEntry.location} while the
     * request is issued against {@code result.location}; presumably both hold the same URL and
     * {@code visit.result} is never {@code null} here - confirm against the caller.
     *
     * @param visit the visit to fetch
     * @throws UnsupportedOperationException if {@link #handles(Visit)} rejects the visit
     * @throws IOException if the request fails or the response body cannot be read
     */
    @Override
    public void handle(final Visit visit) throws IOException {
        if (!handles(visit)) {
            throw new UnsupportedOperationException("Resource not supported");
        }

        if (visit.result.host == null) {
            visit.result.host = new Host();
            visit.result.host.host = URI.create(visit.result.location).getHost();
        }
        visit.result.host.lastRequest = new DateTime();

        final HttpGet request = new HttpGet(visit.result.location);
        request.setConfig(requestConfig);
        request.addHeader("Connection", "close");
        final HttpResponse response = httpClient.execute(request);

        try {
            final int statusCode = response.getStatusLine().getStatusCode();
            switch (statusCode) {
            case HttpStatus.SC_MOVED_PERMANENTLY:
            case SC_PERMANENT_REDIRECT:
                // TODO: the Location target of a permanent redirect is not recorded yet.
                visit.catalogueEntry.status = Status.DISABLED;
                break;
            case HttpStatus.SC_MOVED_TEMPORARILY:
            case HttpStatus.SC_MULTIPLE_CHOICES:
            case HttpStatus.SC_SEE_OTHER:
            case HttpStatus.SC_TEMPORARY_REDIRECT:
                // TODO: the Location target of a temporary redirect is not recorded yet.
                visit.catalogueEntry.status = Status.INDEXED;
                break;
            case HttpStatus.SC_OK:
                populateContent(visit, request, response);
                break;
            default:
                visit.catalogueEntry.status = Status.FAILED;
                // Report the code actually received; visit.statusCode is never assigned here.
                visit.catalogueEntry.status_reason = "Unexpected status code " + statusCode;
            }
        } finally {
            // Releases the underlying connection whether or not the body was consumed.
            request.abort();
        }
    }

    /** Copies the body and content headers of a successful response onto the visit. */
    private static void populateContent(final Visit visit, final HttpGet request, final HttpResponse response)
            throws IOException {
        // Redirects are disabled, so the URI that was requested is the one that was fetched.
        visit.fetchedUrl = request.getURI().toString();

        final ByteArrayOutputStream body = new ByteArrayOutputStream();
        final HttpEntity entity = response.getEntity();
        if (entity != null) {
            IOUtils.copy(entity.getContent(), body);
        }
        // A response without an entity yields an empty stream rather than a NullPointerException.
        visit.contentStream = new ByteArrayInputStream(body.toByteArray());

        // Header#getValue() is used on purpose: Header#toString() renders "Name: value".
        final String contentType = headerValue(response, "Content-Type");
        if (contentType != null) {
            visit.contentType = contentType;
        }
        final String contentEncoding = headerValue(response, "Content-Encoding");
        if (contentEncoding != null) {
            visit.contentEncoding = contentEncoding;
        }
        // NOTE(review): "Content-Charset" is not a standard HTTP header; the charset normally
        // travels as a parameter of Content-Type - confirm whether this lookup is intended.
        final String contentCharset = headerValue(response, "Content-Charset");
        if (contentCharset != null) {
            visit.contentCharset = contentCharset;
        }
    }

    /** Returns the value of the first header with the given name, or {@code null} if absent. */
    private static String headerValue(final HttpResponse response, final String name) {
        final Header header = response.getFirstHeader(name);
        return header == null ? null : header.getValue();
    }

    /**
     * Builds the request configuration, the pooled connection manager and the HTTP client.
     *
     * <p>If the permissive SSL context cannot be created, the failure is logged and no
     * {@code https} socket factory is registered.
     */
    private void buildHttpClient() {
        requestConfig = RequestConfig.custom().setExpectContinueEnabled(false).setCookieSpec(CookieSpecs.DEFAULT)
                .setRedirectsEnabled(false).setSocketTimeout(TIMEOUT_MILLIS).setConnectTimeout(TIMEOUT_MILLIS)
                .setConnectionRequestTimeout(TIMEOUT_MILLIS).setStaleConnectionCheckEnabled(true).build();

        RegistryBuilder<ConnectionSocketFactory> connRegistryBuilder = RegistryBuilder.create();
        connRegistryBuilder.register("http", PlainConnectionSocketFactory.INSTANCE);
        try { // Fixing: https://code.google.com/p/crawler4j/issues/detail?id=174
              // By always trusting the ssl certificate
            // SECURITY: every certificate chain is accepted and hostnames are not verified, so
            // HTTPS connections made by this client are open to man-in-the-middle attacks.
            SSLContext sslContext = SSLContexts.custom().loadTrustMaterial(null, new TrustStrategy() {
                @Override
                public boolean isTrusted(final X509Certificate[] chain, String authType) {
                    return true;
                }
            }).build();
            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext,
                    SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            connRegistryBuilder.register("https", sslsf);
        } catch (KeyManagementException | KeyStoreException | NoSuchAlgorithmException e) {
            LOGGER.warn("Exception thrown while trying to register https");
            LOGGER.debug("Stacktrace", e);
        }

        Registry<ConnectionSocketFactory> connRegistry = connRegistryBuilder.build();
        connectionManager = new PoolingHttpClientConnectionManager(connRegistry);
        connectionManager.setMaxTotal(MAX_CONNECTIONS);
        connectionManager.setDefaultMaxPerRoute(MAX_CONNECTIONS);

        HttpClientBuilder clientBuilder = HttpClientBuilder.create();
        clientBuilder.setDefaultRequestConfig(requestConfig);
        clientBuilder.setConnectionManager(connectionManager);
        clientBuilder.setUserAgent(USER_AGENT);

        httpClient = clientBuilder.build();
    }
}