com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol.java Source code

Java tutorial

Introduction

Here is the source code for com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol.java

Source

/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.protocol.httpclient;

import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.http.Header;
import org.apache.http.HeaderIterator;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.config.RequestConfig.Builder;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.DefaultProxyRoutePlanner;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.Args;
import org.apache.http.util.ByteArrayBuffer;
import org.apache.storm.Config;
import org.slf4j.LoggerFactory;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.protocol.AbstractHttpProtocol;
import com.digitalpebble.stormcrawler.protocol.ProtocolResponse;
import com.digitalpebble.stormcrawler.util.ConfUtils;

import crawlercommons.robots.BaseRobotRules;

/**
 * HTTP/HTTPS protocol implementation built on Apache HttpClient.
 * <p>
 * All instances share a single static
 * {@link PoolingHttpClientConnectionManager}. A short-lived
 * {@link CloseableHttpClient} is built from the configured
 * {@link HttpClientBuilder} for every request; the builder marks the
 * connection manager as shared, so closing that client leaves the pool open.
 * Redirect handling and automatic retries are disabled on the client.
 * <p>
 * The class acts as its own {@link ResponseHandler}: the response headers are
 * copied into a {@link Metadata} object (names lower-cased) and the body is
 * buffered in memory, optionally truncated to the configured
 * <code>http.content.limit</code>.
 **/
public class HttpProtocol extends AbstractHttpProtocol implements ResponseHandler<ProtocolResponse> {

    private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(HttpProtocol.class);

    /** Maximum number of connections kept by the shared pool. */
    private static final int MAX_TOTAL_CONNECTIONS = 200;

    /** Default maximum number of pooled connections per route. */
    private static final int MAX_CONNECTIONS_PER_ROUTE = 20;

    /**
     * Size of the read chunk, and initial buffer capacity used when the
     * entity does not declare its length.
     */
    private static final int DEFAULT_BUFFER_SIZE = 4096;

    private final static PoolingHttpClientConnectionManager CONNECTION_MANAGER = new PoolingHttpClientConnectionManager();

    static {
        CONNECTION_MANAGER.setMaxTotal(MAX_TOTAL_CONNECTIONS);
        CONNECTION_MANAGER.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE);
    }

    /**
     * Maximum number of content bytes kept per response; a negative value
     * means the content is never trimmed.
     */
    private int maxContent;

    /** Template from which a client is built for every request. */
    private HttpClientBuilder builder;

    /** Timeouts and cookie policy applied to every request. */
    private RequestConfig requestConfig;

    /**
     * Reads the protocol settings from the configuration and prepares the
     * client builder and the per-request configuration.
     * <p>
     * Keys read here: <code>http.content.limit</code> (default -1),
     * <code>http.agent.name/version/description/url/email</code>,
     * <code>http.proxy.host</code> (no proxy when null or empty),
     * <code>http.proxy.port</code> (default 8080) and
     * <code>http.timeout</code> (default 10000, used as socket, connect and
     * connection-request timeout of the {@link RequestConfig}).
     *
     * @param conf
     *            the configuration to read the settings from
     */
    @Override
    public void configure(final Config conf) {

        super.configure(conf);

        this.maxContent = ConfUtils.getInt(conf, "http.content.limit", -1);

        String userAgent = getAgentString(ConfUtils.getString(conf, "http.agent.name"),
                ConfUtils.getString(conf, "http.agent.version"),
                ConfUtils.getString(conf, "http.agent.description"), ConfUtils.getString(conf, "http.agent.url"),
                ConfUtils.getString(conf, "http.agent.email"));

        // the connection manager is flagged as shared so that closing the
        // per-request client does not shut the static pool down
        builder = HttpClients.custom().setUserAgent(userAgent).setConnectionManager(CONNECTION_MANAGER)
                .setConnectionManagerShared(true).disableRedirectHandling().disableAutomaticRetries();

        String proxyHost = ConfUtils.getString(conf, "http.proxy.host", null);
        int proxyPort = ConfUtils.getInt(conf, "http.proxy.port", 8080);

        // route everything through a proxy only when a host is configured
        if (StringUtils.isNotEmpty(proxyHost)) {
            HttpHost proxy = new HttpHost(proxyHost, proxyPort);
            builder.setRoutePlanner(new DefaultProxyRoutePlanner(proxy));
        }

        int timeout = ConfUtils.getInt(conf, "http.timeout", 10000);

        requestConfig = RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout)
                .setConnectionRequestTimeout(timeout).setCookieSpec(CookieSpecs.STANDARD).build();
    }

    /**
     * Fetches a URL with a GET request and returns the buffered response.
     * <p>
     * When the metadata holds a non-blank <code>last-modified</code> or
     * <code>etag</code> value, it is sent as <code>If-Modified-Since</code>
     * or <code>If-None-Match</code> respectively.
     *
     * @param url
     *            the URL to fetch
     * @param md
     *            metadata known for the URL, may be null
     * @return the status code, headers and content of the response
     * @throws Exception
     *             if the request cannot be built or executed
     */
    @Override
    public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Exception {

        // computing the pool statistics is wasted work unless they get logged
        if (LOG.isDebugEnabled()) {
            LOG.debug("HTTP connection manager stats {}", CONNECTION_MANAGER.getTotalStats());
        }

        HttpGet httpget = new HttpGet(url);
        httpget.setConfig(requestConfig);

        if (md != null) {
            String lastModified = md.getFirstValue("last-modified");
            if (StringUtils.isNotBlank(lastModified)) {
                httpget.addHeader("If-Modified-Since", lastModified);
            }

            String ifNoneMatch = md.getFirstValue("etag");
            if (StringUtils.isNotBlank(ifNoneMatch)) {
                httpget.addHeader("If-None-Match", ifNoneMatch);
            }
        }

        // no need to release the connection explicitly as this is handled
        // automatically. The client itself must be closed though.
        try (CloseableHttpClient httpclient = builder.build()) {
            return httpclient.execute(httpget, this);
        }
    }

    /**
     * Converts the raw HTTP response into a {@link ProtocolResponse}.
     * <p>
     * Every header is added to the metadata under its lower-cased name. When
     * the content had to be cut at the configured limit, the metadata key
     * <code>http.trimmed</code> is set to <code>true</code>.
     *
     * @param response
     *            the response received from the server
     * @return the content (never null), status code and headers
     * @throws IOException
     *             if the content cannot be read
     */
    @Override
    public ProtocolResponse handleResponse(HttpResponse response) throws ClientProtocolException, IOException {
        int status = response.getStatusLine().getStatusCode();

        Metadata metadata = new Metadata();
        HeaderIterator iter = response.headerIterator();
        while (iter.hasNext()) {
            Header header = iter.nextHeader();
            metadata.addValue(header.getName().toLowerCase(Locale.ROOT), header.getValue());
        }

        MutableBoolean trimmed = new MutableBoolean();
        byte[] bytes = toByteArray(response.getEntity(), maxContent, trimmed);

        if (trimmed.booleanValue()) {
            metadata.setValue("http.trimmed", "true");
            LOG.warn("HTTP content trimmed to {}", bytes.length);
        }

        return new ProtocolResponse(bytes, status, metadata);
    }

    /**
     * Buffers the content of an entity in memory, keeping at most
     * <code>maxContent</code> bytes.
     *
     * @param entity
     *            the entity to read, may be null
     * @param maxContent
     *            maximum number of bytes to keep; any negative value disables
     *            trimming
     * @param trimmed
     *            set to true when content was dropped because of the limit
     * @return the bytes read; an empty array when there is no entity or no
     *         content stream
     * @throws IOException
     *             if reading the stream fails
     * @throws IllegalArgumentException
     *             if no limit is set and the declared length exceeds
     *             {@link Integer#MAX_VALUE}
     */
    private static byte[] toByteArray(final HttpEntity entity, int maxContent, MutableBoolean trimmed)
            throws IOException {

        if (entity == null) {
            return new byte[0];
        }

        try (InputStream instream = entity.getContent()) {
            // same result as for a missing entity, so that callers never have
            // to deal with null content
            if (instream == null) {
                return new byte[0];
            }

            final boolean limited = maxContent >= 0;
            final long declaredLength = entity.getContentLength();

            // with a limit in place the buffer never grows beyond it, so the
            // declared length only matters when everything is kept
            if (!limited) {
                Args.check(declaredLength <= Integer.MAX_VALUE,
                        "HTTP entity too large to be buffered in memory");
            }

            int capacity = declaredLength < 0 ? DEFAULT_BUFFER_SIZE
                    : (int) Math.min(declaredLength, Integer.MAX_VALUE);
            // the declared length comes from the server: do not let it drive
            // an allocation larger than what will be kept
            if (limited && capacity > maxContent) {
                capacity = maxContent;
            }

            final ByteArrayBuffer buffer = new ByteArrayBuffer(capacity);
            final byte[] chunk = new byte[DEFAULT_BUFFER_SIZE];
            int read;
            while ((read = instream.read(chunk)) != -1) {
                if (limited) {
                    // compare against the remaining room rather than summing,
                    // which could overflow for limits near Integer.MAX_VALUE
                    final int room = maxContent - buffer.length();
                    if (read > room) {
                        buffer.append(chunk, 0, room);
                        trimmed.setValue(true);
                        break;
                    }
                }
                buffer.append(chunk, 0, read);
            }
            return buffer.toByteArray();
        }
    }

    /**
     * Command line entry point: fetches a single URL and prints the URL, the
     * response metadata, the status code and the content length. Unless
     * robots handling is skipped, it first prints whether the robots rules
     * allow the URL.
     *
     * @param args
     *            <code>args[0]</code> is the URL to fetch,
     *            <code>args[1]</code> is passed to
     *            <code>ConfUtils.loadConf</code>
     * @throws Exception
     *             if loading the configuration or fetching fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: HttpProtocol <url> <conf>");
            return;
        }

        HttpProtocol protocol = new HttpProtocol();
        Config conf = new Config();

        String url = args[0];
        ConfUtils.loadConf(args[1], conf);
        protocol.configure(conf);

        if (!protocol.skipRobots) {
            BaseRobotRules rules = protocol.getRobotRules(url);
            System.out.println("is allowed : " + rules.isAllowed(url));
        }

        Metadata md = new Metadata();
        ProtocolResponse response = protocol.getProtocolOutput(url, md);
        System.out.println(url);
        System.out.println(response.getMetadata());
        System.out.println(response.getStatusCode());
        System.out.println(response.getContent().length);
    }

}