com.digitalpebble.storm.crawler.protocol.httpclient.HttpProtocol.java Source code

Introduction

Here is the source code for com.digitalpebble.storm.crawler.protocol.httpclient.HttpProtocol.java, the storm-crawler protocol implementation that fetches HTTP and HTTPS content using Apache HttpClient.

Source

/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.storm.crawler.protocol.httpclient;

import java.io.IOException;
import java.util.Locale;

import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.HeaderIterator;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.DefaultProxyRoutePlanner;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.slf4j.LoggerFactory;

import backtype.storm.Config;

import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.protocol.AbstractHttpProtocol;
import com.digitalpebble.storm.crawler.protocol.HttpRobotRulesParser;
import com.digitalpebble.storm.crawler.protocol.ProtocolResponse;
import com.digitalpebble.storm.crawler.protocol.RobotRulesParser;
import com.digitalpebble.storm.crawler.util.ConfUtils;

import crawlercommons.robots.BaseRobotRules;

/**
 * Uses Apache HttpClient to handle HTTP and HTTPS.
 */

public class HttpProtocol extends AbstractHttpProtocol implements ResponseHandler<ProtocolResponse> {

    private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(HttpProtocol.class);

    private static final PoolingHttpClientConnectionManager CONNECTION_MANAGER = new PoolingHttpClientConnectionManager();
    static {
        // Increase the max total connections to 200
        CONNECTION_MANAGER.setMaxTotal(200);
        // Increase the default max connections per route to 20
        CONNECTION_MANAGER.setDefaultMaxPerRoute(20);
    }

    private HttpRobotRulesParser robots;

    /**
     * TODO record the response time in the metadata, see property
     * http.store.responsetime. The flag is read in configure() but not used
     * yet.
     */
    private boolean responseTime = true;

    // TODO find way of limiting the content fetched
    private int maxContent;

    private boolean skipRobots = false;

    private HttpClientBuilder builder;

    private RequestConfig requestConfig;

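    /**
     * Reads the fetcher settings from the topology configuration: the agent
     * string (http.agent.*), the content limit, the response time flag,
     * robots.txt handling, an optional proxy and the timeouts.
     */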
    @Override
    public void configure(final Config conf) {
        this.maxContent = ConfUtils.getInt(conf, "http.content.limit", 64 * 1024);
        String userAgent = getAgentString(ConfUtils.getString(conf, "http.agent.name"),
                ConfUtils.getString(conf, "http.agent.version"),
                ConfUtils.getString(conf, "http.agent.description"), ConfUtils.getString(conf, "http.agent.url"),
                ConfUtils.getString(conf, "http.agent.email"));

        this.responseTime = ConfUtils.getBoolean(conf, "http.store.responsetime", true);

        this.skipRobots = ConfUtils.getBoolean(conf, "http.skip.robots", false);

        robots = new HttpRobotRulesParser(conf);

        builder = HttpClients.custom().setUserAgent(userAgent).setConnectionManager(CONNECTION_MANAGER)
                .setConnectionManagerShared(true).disableRedirectHandling();

        String proxyHost = ConfUtils.getString(conf, "http.proxy.host", null);
        int proxyPort = ConfUtils.getInt(conf, "http.proxy.port", 8080);

        boolean useProxy = (proxyHost != null && proxyHost.length() > 0);

        // use a proxy?
        if (useProxy) {
            HttpHost proxy = new HttpHost(proxyHost, proxyPort);
            DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);
            builder.setRoutePlanner(routePlanner);
        }

        int timeout = ConfUtils.getInt(conf, "http.timeout", 10000);
        requestConfig = RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build();
    }

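    /**
     * Fetches the URL with a GET request. When the metadata contains
     * cachedLastModified or cachedEtag entries, they are sent as
     * If-Modified-Since and If-None-Match headers so that the server can
     * answer with a 304 Not Modified if the content is unchanged.
     */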
    @Override
    public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Exception {

        LOG.debug("HTTP connection manager stats {}", CONNECTION_MANAGER.getTotalStats());

        HttpGet httpget = new HttpGet(url);
        httpget.setConfig(requestConfig);

        if (md != null) {
            String ifModifiedSince = md.getFirstValue("cachedLastModified");
            if (StringUtils.isNotBlank(ifModifiedSince)) {
                httpget.addHeader("If-Modified-Since", ifModifiedSince);
            }

            String ifNoneMatch = md.getFirstValue("cachedEtag");
            if (StringUtils.isNotBlank(ifNoneMatch)) {
                httpget.addHeader("If-None-Match", ifNoneMatch);
            }
        }

        // no need to release the connection explicitly as this is handled
        // automatically. The client itself must be closed though.
        try (CloseableHttpClient httpclient = builder.build()) {
            return httpclient.execute(httpget, this);
        }
    }

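    /**
     * Turns the HTTP response into a ProtocolResponse: the status code, the
     * headers (keys lower-cased) as metadata and the body as a byte array.
     */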
    @Override
    public ProtocolResponse handleResponse(HttpResponse response) throws ClientProtocolException, IOException {
        int status = response.getStatusLine().getStatusCode();
        Metadata metadata = new Metadata();
        HeaderIterator iter = response.headerIterator();
        while (iter.hasNext()) {
            Header header = iter.nextHeader();
            metadata.addValue(header.getName().toLowerCase(Locale.ROOT), header.getValue());
        }
        // TODO find a way of limiting by maxContent
        // a response without a body, e.g. a 304 Not Modified, has no entity
        byte[] bytes = new byte[0];
        if (response.getEntity() != null) {
            bytes = EntityUtils.toByteArray(response.getEntity());
        }
        return new ProtocolResponse(bytes, status, metadata);
    }
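
    // A possible answer to the maxContent TODO in handleResponse (a sketch,
    // not part of the original class): read the entity stream through a
    // bounded buffer and truncate once maxContent bytes have been read,
    // instead of calling EntityUtils.toByteArray(). Types are fully
    // qualified so that the import list above stays unchanged.
    private static byte[] toBoundedByteArray(org.apache.http.HttpEntity entity, int maxContent)
            throws IOException {
        java.io.InputStream in = entity.getContent();
        java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
        try {
            byte[] buffer = new byte[8192];
            int read;
            int total = 0;
            while (total < maxContent && (read = in.read(buffer)) != -1) {
                int toWrite = Math.min(read, maxContent - total);
                out.write(buffer, 0, toWrite);
                total += toWrite;
            }
            return out.toByteArray();
        } finally {
            in.close();
        }
    }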

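    /**
     * Returns the robots.txt rules for the URL, or EMPTY_RULES when
     * http.skip.robots is true.
     */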
    @Override
    public BaseRobotRules getRobotRules(String url) {
        if (this.skipRobots)
            return RobotRulesParser.EMPTY_RULES;
        return robots.getRobotRulesSet(this, url);
    }

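    // Simple command-line test: expects a URL and a path to a configuration
    // file as arguments.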
    public static void main(String[] args) throws Exception {
        HttpProtocol protocol = new HttpProtocol();
        Config conf = new Config();

        String url = args[0];
        ConfUtils.loadConf(args[1], conf);
        protocol.configure(conf);

        if (!protocol.skipRobots) {
            BaseRobotRules rules = protocol.getRobotRules(url);
            System.out.println("is allowed : " + rules.isAllowed(url));
        }

        Metadata md = new Metadata();
        ProtocolResponse response = protocol.getProtocolOutput(url, md);
        System.out.println(url);
        System.out.println(response.getMetadata());
        System.out.println(response.getStatusCode());
        System.out.println(response.getContent().length);
    }

}
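
Example usage

The main method above already shows how to drive the class from the command
line with a URL and a configuration file. As a complement, here is a minimal
sketch of configuring and using HttpProtocol programmatically. It relies on
the fact that backtype.storm.Config extends HashMap, so the properties read
in configure() can be set directly; the agent values and the URL below are
placeholders for illustration, not defaults shipped with the project.

import backtype.storm.Config;

import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.protocol.ProtocolResponse;
import com.digitalpebble.storm.crawler.protocol.httpclient.HttpProtocol;

public class HttpProtocolExample {

    public static void main(String[] args) throws Exception {
        Config conf = new Config();
        // identify the crawler politely; all values are placeholders
        conf.put("http.agent.name", "example-bot");
        conf.put("http.agent.version", "1.0");
        conf.put("http.agent.description", "test agent");
        conf.put("http.agent.url", "http://example.com/bot.html");
        conf.put("http.agent.email", "bot@example.com");
        // read by configure(); note that the class does not enforce the
        // content limit yet (see the TODO in handleResponse)
        conf.put("http.content.limit", 64 * 1024);
        conf.put("http.timeout", 10000);

        HttpProtocol protocol = new HttpProtocol();
        protocol.configure(conf);

        // check robots.txt first, as getProtocolOutput does not do it
        String url = "http://www.example.com/";
        if (!protocol.getRobotRules(url).isAllowed(url)) {
            System.out.println("disallowed by robots.txt: " + url);
            return;
        }

        ProtocolResponse response = protocol.getProtocolOutput(url, new Metadata());
        System.out.println("status: " + response.getStatusCode());
        System.out.println("length: " + response.getContent().length);
        System.out.println(response.getMetadata());
    }
}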