edu.xiyou.fruits.WebCrawler.net.HttpRequest.java Source code

Java tutorial

Introduction

Here is the source code for edu.xiyou.fruits.WebCrawler.net.HttpRequest.java

Source

/*
 *Copyright (c) 2015 Andrew-Wang. 
 *
 *Licensed under the Apache License, Version 2.0 (the "License");
 *you may not use this file except in compliance with the License.
 *You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *Unless required by applicable law or agreed to in writing, software
 *distributed under the License is distributed on an "AS IS" BASIS,
 *WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *See the License for the specific language governing permissions and
 *limitations under the License.
 */
package edu.xiyou.fruits.WebCrawler.net;

import edu.xiyou.fruits.WebCrawler.parser.CrawlDatum;
import edu.xiyou.fruits.WebCrawler.parser.Html;
import edu.xiyou.fruits.WebCrawler.parser.Response;
import edu.xiyou.fruits.WebCrawler.utils.Config;
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.LaxRedirectStrategy;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.SocketTimeoutException;
import java.nio.charset.Charset;
import java.util.Objects;

/**
 * {@link Request} implementation that downloads a page with an HTTP GET through
 * Apache HttpClient and wraps the result in an {@link Html} object.
 *
 * <p>Each instance owns a pooled connection manager and an {@link HttpClient} configured
 * with a browser-like user agent, a retry handler, {@link LaxRedirectStrategy} and the
 * timeouts read from {@link Config}.
 *
 * <p>Usage example:
 * <pre>{@code
 * HttpRequest request = new HttpRequest();
 * CrawlDatum datum = new CrawlDatum("http://www.importnew.com/all-posts");
 * Response response = request.getResponse(datum);
 * if (response != null) {
 *     System.out.println(new String(response.getContent()));
 * }
 * }</pre>
 *
 * Created by andrew on 15-5-20.
 */
public class HttpRequest implements Request {

    /** Upper bound on connections held by the pool across all routes. */
    private static final int MAX_TOTAL_CONNECTIONS = 200;

    /** Upper bound on pooled connections per route. */
    private static final int MAX_CONNECTIONS_PER_ROUTE = 3;

    /** User agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36";

    /** Size, in chars, of the buffer used while reading a response body. */
    private static final int READ_BUFFER_SIZE = 2048;

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    private final PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();

    private final HttpClient client;

    /**
     * Converts a raw HTTP response into an {@link Html} object.
     *
     * <p>Returns {@code null} (after logging the status code) for any status other than
     * 200 OK, and throws {@link ClientProtocolException} when the response has no entity.
     */
    private final ResponseHandler<Response> handler = new ResponseHandler<Response>() {
        @Override
        public Response handleResponse(HttpResponse response) throws ClientProtocolException, IOException {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new ClientProtocolException("entity is null");
            }

            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                logger.info("Response Code is :" + statusCode);
                return null;
            }

            // Fall back to ISO-8859-1 when the Content-Type header declares no charset.
            Charset charset = ContentType.getOrDefault(entity).getCharset();
            if (charset == null) {
                charset = Consts.ISO_8859_1;
            }

            Html html = new Html();
            // The decoded text is re-encoded with the platform default charset, exactly as
            // before; consumers that call new String(content) rely on that.
            // NOTE(review): a fixed charset (e.g. UTF-8) on both sides would be more portable.
            html.setContent(readBody(entity, charset).getBytes(Charset.defaultCharset()));
            html.setStatusLine(response.getStatusLine());
            html.setHeaders(response.getAllHeaders());
            html.setProtocolVersion(response.getProtocolVersion());
            return html;
        }
    };

    /**
     * Builds the pooled HTTP client used for every request made by this instance.
     */
    public HttpRequest() {
        connectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS);
        connectionManager.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE);

        // NOTE(review): only the connection-request and socket timeouts are set here;
        // no connect timeout is configured. Confirm whether that is intentional.
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectionRequestTimeout(Config.TTIME_OUT)
                .setSocketTimeout(Config.TTIME_OUT)
                .build();

        client = HttpClients.custom()
                .setUserAgent(USER_AGENT)
                .setRetryHandler(new DefaultHttpRequestRetryHandler(Config.RETRY, true))
                .setRedirectStrategy(new LaxRedirectStrategy())
                .setDefaultRequestConfig(requestConfig)
                .setConnectionManager(connectionManager)
                .build();
    }

    /**
     * Reads the whole entity body as text.
     *
     * @param entity  entity whose content stream is read and then closed
     * @param charset charset used to decode the bytes
     * @return the decoded body
     * @throws IOException if reading the stream fails
     */
    private static String readBody(HttpEntity entity, Charset charset) throws IOException {
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(entity.getContent(), charset))) {
            char[] buffer = new char[READ_BUFFER_SIZE];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                // Append only the chars actually read; the rest of the buffer holds stale
                // data from the previous iteration.
                text.append(buffer, 0, read);
            }
        }
        return text.toString();
    }

    /**
     * Checks that a datum was supplied.
     *
     * @param datum datum to check
     * @return {@code true} if {@code datum} is not {@code null}
     */
    private boolean checkCrawlDatum(CrawlDatum datum) {
        return datum != null;
    }

    /**
     * Fetches the URL of {@code datum} with an HTTP GET.
     *
     * @param datum crawl task holding the URL to download
     * @return the downloaded page with its URL set, or {@code null} when the status is not
     *         200 OK or the request fails with an I/O or protocol error (the failure is logged)
     * @throws IllegalArgumentException if {@code datum} or its URL is {@code null}, or if
     *         the URL cannot be parsed by {@link HttpGet}
     */
    @Override
    public Response getResponse(CrawlDatum datum) throws IllegalArgumentException {
        if (!checkCrawlDatum(datum)) {
            logger.info("datum is null");
            throw new IllegalArgumentException("illegal format");
        }
        String url = datum.getUrl();
        if (url == null) {
            // Report a missing URL as the declared exception instead of a bare NPE from HttpGet.
            logger.info("datum url is null");
            throw new IllegalArgumentException("illegal format: url is null");
        }

        HttpGet httpGet = new HttpGet(url);
        try {
            Response obj = client.execute(httpGet, handler);
            if (obj != null) {
                // The handler only ever produces Html instances.
                ((Html) obj).setUrl(url);
                return obj;
            }
        } catch (ClientProtocolException e) {
            logger.info("ClientProtocolException while fetching " + url, e);
        } catch (SocketTimeoutException e) {
            // Only the message is logged for timeouts, as before; no stack trace.
            logger.error("{} while fetching {}", e.getMessage(), url);
        } catch (IOException e) {
            logger.info("IOException while fetching " + url, e);
        }

        return null;
    }
}