com.trickl.crawler.protocol.http.HttpProtocol.java Source code

Java tutorial

Introduction

Here is the source code for com.trickl.crawler.protocol.http.HttpProtocol.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.trickl.crawler.protocol.http;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.droids.api.ManagedContentEntity;
import org.apache.droids.api.Protocol;
import org.apache.droids.norobots.ContentLoader;
import org.apache.droids.norobots.NoRobotClient;
import org.apache.droids.norobots.NoRobotException;
import org.apache.droids.protocol.http.DroidsHttpClient;
import org.apache.droids.protocol.http.HttpClientContentLoader;
import org.apache.droids.protocol.http.HttpContentEntity;
import org.apache.http.*;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.entity.StringEntity;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.CoreProtocolPNames;
import org.codehaus.jackson.JsonGenerationException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HttpProtocol implements Protocol {

    private final Logger log = LoggerFactory.getLogger(HttpProtocol.class);

    private final HttpClient httpclient;
    private final ContentLoader contentLoader;

    private boolean forceAllow = false;
    private String method = HttpGet.METHOD_NAME;
    private Map<String, Object> postData = new HashMap<String, Object>();
    private Map<String, String> headerData = new HashMap<String, String>();
    private String userAgent = "Apache-Droids/1.1 (java 1.5)";

    public HttpProtocol(final HttpClient httpclient) {
        super();
        this.httpclient = httpclient;
        this.httpclient.getParams().setParameter(CoreProtocolPNames.USER_AGENT, userAgent);
        this.contentLoader = new HttpClientContentLoader(httpclient);
    }

    public HttpProtocol() {
        this(new DroidsHttpClient());
    }

    @Override
    public ManagedContentEntity load(URI uri) throws IOException {
        HttpRequestBase httpRequest;

        if (method.equalsIgnoreCase(HttpPost.METHOD_NAME)) {
            HttpPost httpPost = new HttpPost(uri);

            // Add header data
            for (Map.Entry<String, String> headerDataEntry : headerData.entrySet()) {
                httpPost.setHeader(headerDataEntry.getKey(), headerDataEntry.getValue());
            }

            // Add post data
            String contentType = headerData.get("Content-Type");
            if (contentType == null || "application/x-www-form-urlencoded".equalsIgnoreCase(contentType)) {
                List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
                for (Map.Entry<String, Object> postDataEntry : postData.entrySet()) {
                    nameValuePairs.add(
                            new BasicNameValuePair(postDataEntry.getKey(), postDataEntry.getValue().toString()));
                }
                httpPost.setEntity(new UrlEncodedFormEntity(nameValuePairs));
            } else if ("application/json".equalsIgnoreCase(contentType)) {
                ObjectMapper mapper = new ObjectMapper();
                StringEntity se;
                try {
                    String jsonString = mapper.writeValueAsString(postData);
                    se = new StringEntity(jsonString);
                    httpPost.setEntity(se);
                } catch (JsonGenerationException ex) {
                    log.error("Failed to generate JSON.", ex);
                } catch (JsonMappingException ex) {
                    log.error("Failed to generate JSON.", ex);
                }
            }
            httpRequest = httpPost;
        } else {
            httpRequest = new HttpGet(uri);
        }

        HttpResponse response = httpclient.execute(httpRequest);
        StatusLine statusline = response.getStatusLine();
        if (statusline.getStatusCode() >= HttpStatus.SC_BAD_REQUEST) {
            httpRequest.abort();
            throw new HttpResponseException(statusline.getStatusCode(), statusline.getReasonPhrase());
        }
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            // Should _almost_ never happen with HTTP GET requests.
            throw new ClientProtocolException("Empty entity");
        }
        long maxlen = httpclient.getParams().getLongParameter(DroidsHttpClient.MAX_BODY_LENGTH, 0);
        return new HttpContentEntity(entity, maxlen);
    }

    @Override
    public boolean isAllowed(URI uri) throws IOException {
        if (forceAllow) {
            return forceAllow;
        }

        URI baseURI;
        try {
            baseURI = new URI(uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(), "/", null, null);
        } catch (URISyntaxException ex) {
            log.error("Unable to determine base URI for " + uri);
            return false;
        }

        NoRobotClient nrc = new NoRobotClient(contentLoader, userAgent);
        try {
            nrc.parse(baseURI);
        } catch (NoRobotException ex) {
            log.error("Failure parsing robots.txt: " + ex.getMessage());
            return false;
        }
        boolean test = nrc.isUrlAllowed(uri);
        if (log.isInfoEnabled()) {
            log.info(uri + " is " + (test ? "allowed" : "denied"));
        }
        return test;
    }

    public String getUserAgent() {
        return userAgent;
    }

    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
        this.httpclient.getParams().setParameter(CoreProtocolPNames.USER_AGENT, userAgent);
    }

    /**
     * You can force that a site is allowed (ignoring the robots.txt). This should
     * only be used on server that you control and where you have the permission
     * to ignore the robots.txt.
     * 
     * @return <code>true</code> if you are rude and ignore robots.txt.
     *         <code>false</code> if you are playing nice.
     */
    public boolean isForceAllow() {
        return forceAllow;
    }

    /**
     * You can force that a site is allowed (ignoring the robot.txt). This should
     * only be used on server that you control and where you have the permission
     * to ignore the robots.txt.
     * 
     * @param forceAllow
     *                if you want to force an allow and ignore the robot.txt set
     *                to <code>true</code>. If you want to obey the rules and
     *                be polite set to <code>false</code>.
     */
    public void setForceAllow(boolean forceAllow) {
        this.forceAllow = forceAllow;
    }

    protected HttpClient getHttpClient() {
        return this.httpclient;
    }

    /**
     * @return the method
     */
    public String getMethod() {
        return method;
    }

    /**
     * @param method the method to set
     */
    public void setMethod(String method) {
        this.method = method;
    }

    /**
     * @return the postData
     */
    public Map<String, Object> getPostData() {
        return postData;
    }

    /**
     * @param postData the postData to set
     */
    public void setPostData(Map<String, Object> postData) {
        this.postData = postData;
    }

    public void setHeaderData(Map<String, String> headerData) {
        this.headerData = headerData;
    }
}