io.mandrel.requests.http.ApacheHttpRequester.java Source code

Java tutorial

Introduction

Here is the source code for io.mandrel.requests.http.ApacheHttpRequester.java

Source

/*
 * Licensed to Mandrel under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Mandrel licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package io.mandrel.requests.http;

import io.mandrel.blob.Blob;
import io.mandrel.blob.BlobMetadata;
import io.mandrel.common.data.Param;
import io.mandrel.common.data.Spider;
import io.mandrel.common.net.Uri;
import io.mandrel.common.service.TaskContext;
import io.mandrel.requests.RequestException;
import io.mandrel.requests.Requester;
import io.mandrel.requests.http.ua.FixedUserAgentProvisionner.FixedUserAgentProvisionnerDefinition;
import io.mandrel.requests.http.ua.UserAgentProvisionner;
import io.mandrel.requests.http.ua.UserAgentProvisionner.UserAgentProvisionnerDefinition;
import io.mandrel.requests.proxy.ProxyServer;

import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Semaphore;
import java.util.stream.Collectors;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;

import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.Accessors;

import org.apache.commons.io.IOUtils;
import org.apache.http.Consts;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.config.RequestConfig.Builder;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.ConnectionConfig;
import org.apache.http.config.MessageConstraints;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.conn.DnsResolver;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.conn.SystemDefaultDnsResolver;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.ssl.SSLContexts;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Strings;
import com.google.common.collect.Sets;
import com.google.common.net.HttpHeaders;

@Data
@Accessors(chain = true, fluent = true)
@EqualsAndHashCode(callSuper = false)
public class ApacheHttpRequester extends Requester {

    /**
     * Jackson-mapped definition from which an {@link ApacheHttpRequester} is
     * built. Registered under the name {@code "hc"}.
     */
    @Data
    @Accessors(chain = false, fluent = false)
    @EqualsAndHashCode(callSuper = false)
    public static class ApacheHttpRequesterDefinition extends RequesterDefinition<ApacheHttpRequester> {

        private static final long serialVersionUID = -9205125497698919267L;

        /** Forwarded to the requester's {@code maxLineLength}; -1 by default. */
        @JsonProperty("max_line_length")
        private int maxLineLength = -1;

        /** Forwarded to the requester's {@code maxHeaderCount}; -1 by default. */
        @JsonProperty("max_header_count")
        private int maxHeaderCount = -1;

        /**
         * Defaults to the number of available processors. Not forwarded to the
         * requester by {@link #build(TaskContext)}.
         */
        @JsonProperty("io_thread_count")
        private int ioThreadCount = Runtime.getRuntime().availableProcessors();

        /** Maximum number of redirects, 3 by default. */
        @JsonProperty("max_redirects")
        private int maxRedirects = 3;

        /** Whether redirects are followed, true by default. */
        @JsonProperty("follow_redirects")
        private boolean followRedirects = true;

        /** Extra headers added to every prepared request; may be null. */
        @JsonProperty("headers")
        private Set<Header> headers;

        /** Extra parameters set on every prepared request; may be null. */
        @JsonProperty("params")
        private Set<Param> params;

        /** Cookies used to seed the cookie store; may be null. */
        @JsonProperty("cookies")
        private List<Cookie> cookies;

        /** Definition of the user agent provider; a fixed one by default. */
        @JsonProperty("user_agent_provisionner")
        private UserAgentProvisionnerDefinition userAgentProvisionner = new FixedUserAgentProvisionnerDefinition();

        /** @return the identifier of this requester type, {@code "hc"} */
        @Override
        public String name() {
            return "hc";
        }

        /**
         * Creates a requester bound to the given context, copies this
         * definition's settings onto it and delegates to the inherited
         * {@code build(requester, context)}.
         *
         * @param context
         *            the task context handed to the requester and to the user
         *            agent provisionner definition
         * @return the result of the inherited build step
         */
        @Override
        public ApacheHttpRequester build(TaskContext context) {
            ApacheHttpRequester requester = new ApacheHttpRequester(context);

            // Same setters, same order as a single fluent chain would apply.
            requester.maxLineLength(maxLineLength);
            requester.maxHeaderCount(maxHeaderCount);
            requester.cookies(cookies);
            requester.followRedirects(followRedirects);
            requester.headers(headers);
            requester.maxRedirects(maxRedirects);
            requester.params(params);
            requester.userAgentProvisionner(userAgentProvisionner.build(context));

            return build(requester, context);
        }

    }

    /**
     * Creates a requester without a task context ({@code null} is passed to
     * the superclass constructor).
     */
    public ApacheHttpRequester() {
        super(null);
    }

    /**
     * Creates a requester bound to the given task context.
     *
     * @param context
     *            the context handed to the superclass constructor
     */
    public ApacheHttpRequester(TaskContext context) {
        super(context);
    }

    // Settings copied from the definition; init() and prepareRequest() read
    // them through their accessors.

    // Given to MessageConstraints.setMaxLineLength(...) in init().
    private int maxLineLength = -1;
    // Given to MessageConstraints.setMaxHeaderCount(...) in init().
    private int maxHeaderCount = -1;
    // Given to RequestConfig.setMaxRedirects(...) in init().
    private int maxRedirects;
    // Given to RequestConfig.setRedirectsEnabled(...) in init().
    private boolean followRedirects;
    // Added to each request in prepareRequest(); may be null.
    private Set<Header> headers;
    // Set as HttpParams on each request in prepareRequest(); may be null.
    private Set<Param> params;
    // Copied into a cookie store by prepareContext(); may be null.
    private List<Cookie> cookies;
    // Supplies the User-Agent value in prepareRequest().
    private UserAgentProvisionner userAgentProvisionner;

    // //////////////////////////////////////
    // Runtime state, assigned by init().

    // The HTTP client; null until init() has run.
    private CloseableHttpClient client;

    // Base request configuration, copied per request in prepareRequest().
    private RequestConfig defaultRequestConfig;

    // Fair semaphore created with maxParallel() permits in init(); it is not
    // acquired anywhere in this class.
    private Semaphore available;

    /**
     * Closes the underlying HTTP client.
     * <p>
     * Does nothing when {@link #init()} has not been called yet, so that
     * closing a requester that was never initialised does not fail with a
     * {@link NullPointerException}.
     *
     * @throws IOException
     *             if the client fails to close
     */
    public void close() throws IOException {
        if (client != null) {
            client.close();
        }
    }

    /**
     * Builds the pooled HTTP client and the default request configuration used
     * by this requester.
     * <p>
     * Sets up, in turn: the semaphore sized by {@code maxParallel()}, the
     * http/https socket factories, a DNS resolver delegating to
     * {@code nameResolver()}, the pooling connection manager and its
     * connection defaults, the default {@link RequestConfig}, and finally the
     * client itself.
     */
    public void init() {

        available = new Semaphore(maxParallel(), true);

        // One socket factory per scheme; https relies on the system default
        // SSL context and the default hostname verifier.
        ConnectionSocketFactory plainFactory = PlainConnectionSocketFactory.getSocketFactory();
        ConnectionSocketFactory secureFactory = new SSLConnectionSocketFactory(SSLContexts.createSystemDefault(),
                new DefaultHostnameVerifier());
        Registry<ConnectionSocketFactory> socketFactories = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", plainFactory).register("https", secureFactory).build();

        // "localhost" is answered with 127.0.0.1 directly; every other host
        // goes through the configured name resolver.
        DnsResolver resolver = host -> {
            InetAddress address = host.equalsIgnoreCase("localhost")
                    ? InetAddress.getByAddress(new byte[] { 127, 0, 0, 1 })
                    : nameResolver().resolve(host);
            return new InetAddress[] { address };
        };

        // Connection-level defaults: UTF-8, undecodable input ignored, and the
        // configured header count / line length limits.
        MessageConstraints constraints = MessageConstraints.custom().setMaxHeaderCount(maxHeaderCount)
                .setMaxLineLength(maxLineLength).build();
        ConnectionConfig connectionDefaults = ConnectionConfig.custom().setCharset(Consts.UTF_8)
                .setMalformedInputAction(CodingErrorAction.IGNORE)
                .setUnmappableInputAction(CodingErrorAction.IGNORE).setMessageConstraints(constraints).build();

        // Pool: the same limit is applied globally and per route.
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager(socketFactories,
                resolver);
        pool.setDefaultConnectionConfig(connectionDefaults);
        pool.setMaxTotal(maxPersistentConnections());
        pool.setDefaultMaxPerRoute(maxPersistentConnections());

        // TODO
        // Use custom credentials provider if necessary.

        // Request defaults shared by every request prepared by this requester.
        Builder requestDefaults = RequestConfig.custom();
        requestDefaults.setCookieSpec(CookieSpecs.DEFAULT);
        requestDefaults.setExpectContinueEnabled(true);
        requestDefaults.setStaleConnectionCheckEnabled(true);
        requestDefaults.setTargetPreferredAuthSchemes(Arrays.asList(AuthSchemes.NTLM, AuthSchemes.DIGEST));
        requestDefaults.setProxyPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC));
        requestDefaults.setMaxRedirects(maxRedirects());
        requestDefaults.setSocketTimeout(socketTimeout());
        requestDefaults.setConnectTimeout(connectTimeout());
        requestDefaults.setConnectionRequestTimeout(requestTimeOut());
        requestDefaults.setRedirectsEnabled(followRedirects());
        defaultRequestConfig = requestDefaults.build();

        // Finally assemble the client from the pool and the request defaults.
        client = HttpClients.custom().setConnectionManager(pool).setDefaultRequestConfig(defaultRequestConfig)
                .build();
    }

    /**
     * Creates an HTTP context whose cookie store is seeded with the configured
     * cookies.
     * <p>
     * An expiry date is only set on a cookie whose {@code expires()} value is
     * strictly positive. {@link #extractWebPage} records {@code 0} for cookies
     * that carry no expiry date; turning such a value into a {@link Date}
     * would yield the epoch, i.e. a date in the past, and the cookie would be
     * treated as already expired.
     *
     * @param spider
     *            the spider the request is made for (currently unused)
     * @return a new context holding the cookie store under
     *         {@link HttpClientContext#COOKIE_STORE}
     */
    public HttpContext prepareContext(Spider spider) {
        CookieStore store = new BasicCookieStore();
        if (cookies() != null) {
            for (Cookie cookie : cookies()) {
                if (cookie == null) {
                    continue;
                }
                BasicClientCookie theCookie = new BasicClientCookie(cookie.name(), cookie.value());
                theCookie.setDomain(cookie.domain());
                theCookie.setPath(cookie.path());
                // Non-positive means "no expiry date": keep it a session cookie.
                if (cookie.expires() > 0) {
                    theCookie.setExpiryDate(new Date(cookie.expires()));
                }
                theCookie.setSecure(cookie.secure());
                store.addCookie(theCookie);
            }
        }

        HttpContext localContext = new BasicHttpContext();
        localContext.setAttribute(HttpClientContext.COOKIE_STORE, store);
        return localContext;
    }

    /**
     * Fetches the given uri on behalf of a spider.
     * <p>
     * The request is built by {@link #prepareRequest(Uri, Spider)} and run
     * within the context built by {@link #prepareContext(Spider)}, so that the
     * configured cookies are actually sent and the cookies present in the
     * store after the exchange are reported by
     * {@link #extractWebPage(Uri, CloseableHttpResponse, HttpContext)}.
     *
     * @param uri
     *            the resource to fetch
     * @param spider
     *            the spider the request is made for
     * @return the fetched content with its metadata
     * @throws io.mandrel.requests.ConnectTimeoutException
     *             if the connection could not be established in time
     * @throws RequestException
     *             if executing the request fails with any other I/O error
     * @throws Exception
     *             if reading the response fails
     */
    public Blob get(Uri uri, Spider spider) throws Exception {
        HttpUriRequest request = prepareRequest(uri, spider);
        HttpContext localContext = prepareContext(spider);
        CloseableHttpResponse response;
        try {
            response = client.execute(request, localContext);
        } catch (ConnectTimeoutException e) {
            throw new io.mandrel.requests.ConnectTimeoutException(e);
        } catch (IOException e) {
            throw new RequestException(e);
        }
        return extractWebPage(uri, response, localContext);
    }

    /**
     * Fetches the given uri with a plain GET: no spider-specific headers,
     * proxy or cookie context are applied.
     *
     * @param uri
     *            the resource to fetch
     * @return the fetched content with its metadata
     * @throws Exception
     *             if executing the request or reading the response fails
     */
    public Blob get(Uri uri) throws Exception {
        HttpGet plainGet = new HttpGet(uri.toURI());
        CloseableHttpResponse response = client.execute(plainGet);
        return extractWebPage(uri, response, null);
    }

    /**
     * Builds the GET request for the given uri: configured headers and
     * parameters, the User-Agent header, and an optional proxy on top of the
     * default request configuration.
     *
     * @param uri
     *            the resource to fetch
     * @param spider
     *            the spider the request is made for; used to pick the user
     *            agent and the proxy
     * @return the request, ready to be executed
     */
    public HttpUriRequest prepareRequest(Uri uri, Spider spider) {
        Builder builder = RequestConfig.copy(defaultRequestConfig);

        HttpGet request = new HttpGet(uri.toURI());

        // Add the configured headers
        if (headers() != null) {
            for (Header header : headers()) {
                if (header != null) {
                    request.addHeader(header.getName(), header.getValue());
                }
            }
        }

        // Add the configured parameters
        HttpParams params = new BasicHttpParams();
        if (params() != null) {
            for (Param param : params()) {
                if (param != null) {
                    params.setParameter(param.getName(), param.getValue());
                }
            }
        }
        request.setParams(params);

        // Configure the user agent: the header is only added when a value is
        // actually provided (the check used to be inverted, so the header was
        // never sent with a usable value).
        String userAgent = userAgentProvisionner().get(uri.toString(), spider);
        if (!Strings.isNullOrEmpty(userAgent)) {
            request.addHeader(HttpHeaders.USER_AGENT, userAgent);
        }

        // Configure the proxy
        ProxyServer proxy = proxyServersSource().findProxy(spider);
        if (proxy != null) {
            // TODO Auth!
            HttpHost proxyHost = new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getProtocol().getProtocol());
            builder.setProxy(proxyHost);
        }

        request.setConfig(builder.build());
        return request;
    }

    /**
     * Converts an HTTP response into a {@link Blob}: headers, cookies and
     * status go into the fetch metadata, the entity content becomes the
     * payload. The response is always closed before returning.
     * <p>
     * Headers that occur several times (e.g. {@code Set-Cookie}) keep all of
     * their values, in response order, under a single map entry. A response
     * without an entity yields an empty payload of size 0.
     *
     * @param uri
     *            the uri that was fetched
     * @param result
     *            the response to read; closed by this method
     * @param localContext
     *            the context the request was executed in, or null; when it
     *            holds a cookie store, its cookies are reported in the metadata
     * @return the blob holding the payload and its metadata
     * @throws MalformedURLException
     *             declared for callers; not thrown directly by this method
     * @throws IOException
     *             if reading the content or closing the response fails
     */
    public Blob extractWebPage(Uri uri, CloseableHttpResponse result, HttpContext localContext)
            throws MalformedURLException, IOException {
        try {
            // Group by name so that repeated headers do not overwrite each other.
            Map<String, List<String>> headers;
            Header[] rawHeaders = result.getAllHeaders();
            if (rawHeaders == null) {
                headers = new HashMap<String, List<String>>();
            } else {
                headers = Arrays.stream(rawHeaders).collect(
                        Collectors.groupingBy(Header::getName, Collectors.mapping(Header::getValue, Collectors.toList())));
            }

            List<io.mandrel.requests.http.Cookie> cookies = null;
            if (localContext != null) {
                CookieStore store = (CookieStore) localContext.getAttribute(HttpClientContext.COOKIE_STORE);
                // The context may have been created without a cookie store.
                if (store != null && store.getCookies() != null) {
                    // NOTE(review): the second numeric argument is the expiry
                    // timestamp truncated to int; this looks unintended but
                    // the meaning of that constructor argument is not visible
                    // here — confirm against io.mandrel.requests.http.Cookie.
                    cookies = store.getCookies().stream().filter(cookie -> cookie != null)
                            .map(cookie -> new io.mandrel.requests.http.Cookie(cookie.getName(), cookie.getValue(),
                                    cookie.getDomain(), cookie.getPath(),
                                    cookie.getExpiryDate() != null ? cookie.getExpiryDate().getTime() : 0,
                                    cookie.getExpiryDate() != null ? (int) cookie.getExpiryDate().getTime() : 0,
                                    cookie.isSecure(), false))
                            .collect(Collectors.toList());
                }
            }

            HttpFetchMetadata metadata = new HttpFetchMetadata().headers(headers).cookies(cookies);
            metadata.setUri(uri)
                    .setStatusCode(result.getStatusLine() != null ? result.getStatusLine().getStatusCode() : 0)
                    .setStatusText(
                            result.getStatusLine() != null ? result.getStatusLine().getReasonPhrase() : null);

            HttpEntity entity = result.getEntity();
            if (entity == null) {
                // Responses such as 204 or 304 carry no entity at all.
                return new Blob(new BlobMetadata().setUri(uri).setSize(0L).setFetchMetadata(metadata))
                        .payload(new byte[0]);
            }

            long contentLength = entity.getContentLength();
            // Closing the input stream triggers the connection release.
            try (InputStream content = entity.getContent()) {
                return new Blob(new BlobMetadata().setUri(uri)
                        .setSize(contentLength < 0 ? null : contentLength).setFetchMetadata(metadata))
                                .payload(IOUtils.toByteArray(content));
            }
        } finally {
            result.close();
        }
    }

    /**
     * @return a new mutable set holding the schemes handled by this requester,
     *         {@code "http"} and {@code "https"}
     */
    @Override
    public Set<String> getProtocols() {
        Set<String> supported = Sets.newHashSet();
        supported.add("http");
        supported.add("https");
        return supported;
    }

    /**
     * Placeholder check: no verification is performed yet.
     *
     * @return always {@code true}
     */
    @Override
    public boolean check() {
        // TODO implement an actual check
        return true;
    }
}