me.zhuoran.crawler4j.crawler.http.HttpClient.java Source code

Java tutorial

Introduction

Here is the source code for me.zhuoran.crawler4j.crawler.http.HttpClient.java

Source

/*
 * Copyright (c) 2012 Zhuoran Wang <zoran.wang@gmail.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package me.zhuoran.crawler4j.crawler.http;

import java.io.IOException;

import me.zhuoran.crawler4j.crawler.URL;
import me.zhuoran.crawler4j.crawler.config.CrawlerConfig;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Simple HttpClient Wrapper Class.
 * <p>
 * Every {@code requestHttpGet} overload issues an HTTP GET through the client
 * returned by {@code HttpConnectionManager.getHttpClient()} and retries a
 * failed attempt up to {@link #MAX_ATTEMPTS} times in total. A fetch counts as
 * successful only when the response status is {@code 200 OK} and the response
 * carries an entity; otherwise {@code null} is returned.
 *
 * @author Zoran
 *
 */
public class HttpClient {

    private static final Logger logger = LoggerFactory.getLogger(HttpClient.class);

    /** Total number of attempts (first try included) made for one GET. */
    private static final int MAX_ATTEMPTS = 5;

    /** Thread name used in log output when the caller supplies none. */
    private static final String DEFAULT_THREAD_NAME = "null";

    /** Charset handed to the fetch result when the caller supplies none. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /**
     * Do get http request use custom header.
     * @param config the CrawlerConfig instance
     * @param url the target url instance
     * @param header the custom http header, may be {@code null} for none
     * @return HttpFetchResult the result of fetch, or {@code null} when every attempt failed
     */
    public HttpFetchResult requestHttpGet(final CrawlerConfig config, final URL url, final Header header) {
        final HttpGet req = new HttpGet(url.toFullString());
        if (header != null) {
            req.addHeader(header);
        }
        return doGet(config.getName(), config.getDefaultCharset(), url, req);
    }

    /**
     * Do get http request.
     * @param threadName the crawler name
     * @param defaultCharset the crawler charset 
     * @param url the target url
     * @return HttpFetchResult the result of fetch, or {@code null} when every attempt failed
     */
    public HttpFetchResult requestHttpGet(final String threadName, final String defaultCharset, final URL url) {
        final HttpGet req = new HttpGet(url.toFullString());
        return doGet(threadName, defaultCharset, url, req);
    }

    /**
     * Do get http request without a custom header.
     * @param config the CrawlerConfig instance
     * @param url the target url instance
     * @return HttpFetchResult the result of fetch, or {@code null} when every attempt failed
     */
    public HttpFetchResult requestHttpGet(final CrawlerConfig config, final URL url) {
        return requestHttpGet(config, url, (Header) null);
    }

    /**
     * Do get http request with the default thread name and the default charset.
     * @param url the target url
     * @return HttpFetchResult the result of fetch, or {@code null} when every attempt failed
     */
    public HttpFetchResult requestHttpGet(final URL url) {
        return requestHttpGet(DEFAULT_THREAD_NAME, DEFAULT_CHARSET, url);
    }

    /**
     * Executes the GET, retrying until it succeeds or {@link #MAX_ATTEMPTS} is reached.
     * <p>
     * An aborted request object cannot be executed again, so whenever the
     * previous attempt ended in an abort a fresh copy of the caller's request
     * (same URI, same headers) is used for the next attempt.
     *
     * @param threadName the crawler name, used for log output only
     * @param defaultCharset the charset passed on to the fetch result
     * @param url the target url
     * @param req the prepared request; used as-is for the first attempt and as
     *            the template for later attempts
     * @return the fetch result, or {@code null} when no attempt produced a
     *         {@code 200 OK} response with an entity
     */
    private HttpFetchResult doGet(final String threadName, final String defaultCharset, final URL url,
            final HttpGet req) {
        final String target = url.toFullString();
        HttpGet current = req;
        // Last status line seen on any attempt; reported in the final warning.
        StatusLine status = null;

        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                if (current.isAborted()) {
                    current = copyOf(req);
                }
                final HttpContext localContext = new BasicHttpContext();
                final HttpResponse rsp = HttpConnectionManager.getHttpClient().execute(current, localContext);
                status = rsp.getStatusLine();
                final HttpEntity entity = rsp.getEntity();
                if (status.getStatusCode() == HttpStatus.SC_OK && entity != null) {
                    if (logger.isDebugEnabled()) {
                        logger.debug(threadName + " request get " + target + " " + status);
                    }
                    return new HttpFetchResult(rsp, defaultCharset);
                }
                // Unexpected status or no body: drop the connection and try again.
                abortQuietly(current);
            } catch (IOException ex) {
                // In case of an IOException the connection will be released
                // back to the connection manager automatically
                if (logger.isDebugEnabled()) {
                    logger.debug(threadName + " request " + target + " attempt " + attempt + " failed", ex);
                }
            } catch (Throwable e) {
                // Deliberately broad: one bad fetch must not take the caller down.
                abortQuietly(current);
                logger.error(threadName + " request " + target + " " + e.getMessage(), e);
            }
        }

        // Nothing usable was obtained; make sure the last request holds no connection.
        abortQuietly(current);
        logger.warn(threadName + " request get " + target + " " + status);
        return null;
    }

    /**
     * Builds a new, not yet executed GET with the same URI and headers as the template.
     */
    private static HttpGet copyOf(final HttpGet template) {
        final HttpGet copy = new HttpGet(template.getURI());
        copy.setHeaders(template.getAllHeaders());
        return copy;
    }

    /**
     * Aborts the request if it is not aborted yet; a failure to abort is logged, never thrown.
     */
    private static void abortQuietly(final HttpGet request) {
        try {
            if (!request.isAborted()) {
                request.abort();
            }
        } catch (RuntimeException e) {
            logger.warn("abort of request " + request.getURI() + " failed", e);
        }
    }

}