Example usage for org.apache.commons.httpclient HttpStatus SC_GONE

List of usage examples for org.apache.commons.httpclient HttpStatus SC_GONE

Introduction

On this page you can find example usages of org.apache.commons.httpclient HttpStatus SC_GONE.

Prototype

int SC_GONE

To view the source code for org.apache.commons.httpclient HttpStatus SC_GONE, click the Source Link.

Document

<tt>410 Gone</tt> (HTTP/1.1 - RFC 2616)

Usage

From source file:org.cloudcrawler.domain.crawler.robotstxt.RobotsTxtService.java

/**
 * Evaluates whether the provided uri is allowed to be crawled according to
 * the robots.txt of the website it belongs to.
 *
 * <p>The robots.txt location is built from the scheme, user info, host and
 * port of {@code uri}. Parsed rules are looked up in {@code cache} first and
 * only fetched through {@code httpService} on a cache miss.
 *
 * @param uri the uri that should be checked
 * @return the result of {@code isAllowed} on the robots.txt rules for {@code uri}
 * @throws Exception if building the robots.txt uri, fetching it, or reading
 *                   its content fails
 */
public boolean isAllowedUri(URI uri) throws Exception {
    URIBuilder uriBuilder = new URIBuilder();
    uriBuilder.setScheme(uri.getScheme());
    uriBuilder.setHost(uri.getHost());
    // robots.txt is scoped to scheme + host + port; getPort() is -1 when the
    // uri carries no explicit port, which URIBuilder treats as "unset".
    uriBuilder.setPort(uri.getPort());
    uriBuilder.setUserInfo(uri.getUserInfo());
    uriBuilder.setPath("/robots.txt");

    URI robotsTxtUri = uriBuilder.build();
    String cacheKey = robotsTxtUri.toString();
    BaseRobotRules rules = (BaseRobotRules) cache.get(cacheKey);

    if (rules == null) {
        HttpResponse response = httpService.get(robotsTxtUri);

        try {
            // A response without an entity or without a Content-Type header is
            // handled like "not plain text" instead of failing with a
            // NullPointerException.
            String contentType = null;
            if (response.getEntity() != null && response.getEntity().getContentType() != null) {
                contentType = response.getEntity().getContentType().getValue();
            }
            boolean isPlainText = (contentType != null) && contentType.startsWith("text/plain");

            // HACK! DANGER! Some sites answer a robots.txt request with their
            // top-level page instead of returning a 404. Any response that is
            // not plain text is therefore assumed to be such a page, which is
            // treated the same as not having a robots.txt file.
            if (response.getStatusLine().getStatusCode() == 404 || !isPlainText) {
                rules = robotsTxtParser.failedFetch(HttpStatus.SC_GONE);
            } else {
                // Hand the raw bytes to the parser; decoding to a String and
                // re-encoding with the platform charset could corrupt them.
                byte[] content = IOUtils.toByteArray(response.getEntity().getContent());

                rules = robotsTxtParser.parseContent(uri.toString(), content, contentType,
                        httpService.getUserAgent());
            }
        } finally {
            // Always release the connection, on success as well as on failure.
            EntityUtils.consume(response.getEntity());
        }

        // Only successfully computed rules are cached
        // (60 * 60 * 24 is presumably a one-day TTL in seconds - TODO confirm).
        cache.set(cacheKey, 60 * 60 * 24, rules);
    }

    return rules.isAllowed(uri.toString());
}

From source file:org.opens.tanaguru.util.http.HttpRequestHandler.java

/**
 * Reduces an HTTP status code to a binary flag.
 *
 * @param status the HTTP status code to classify
 * @return {@code 0} for the client-error and server-error codes listed
 *         below, {@code 1} for every other value (including codes that are
 *         not listed at all)
 */
private int computeStatus(int status) {
    switch (status) {
    // Client errors (4xx), in numeric order.
    case HttpStatus.SC_BAD_REQUEST:
    case HttpStatus.SC_UNAUTHORIZED:
    case HttpStatus.SC_PAYMENT_REQUIRED:
    case HttpStatus.SC_FORBIDDEN:
    case HttpStatus.SC_NOT_FOUND:
    case HttpStatus.SC_METHOD_NOT_ALLOWED:
    case HttpStatus.SC_NOT_ACCEPTABLE:
    case HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED:
    case HttpStatus.SC_REQUEST_TIMEOUT:
    case HttpStatus.SC_CONFLICT:
    case HttpStatus.SC_GONE:
    case HttpStatus.SC_LENGTH_REQUIRED:
    case HttpStatus.SC_PRECONDITION_FAILED:
    case HttpStatus.SC_REQUEST_TOO_LONG:
    case HttpStatus.SC_REQUEST_URI_TOO_LONG:
    case HttpStatus.SC_UNSUPPORTED_MEDIA_TYPE:
    case HttpStatus.SC_REQUESTED_RANGE_NOT_SATISFIABLE:
    case HttpStatus.SC_EXPECTATION_FAILED:
    case HttpStatus.SC_INSUFFICIENT_SPACE_ON_RESOURCE:
    case HttpStatus.SC_METHOD_FAILURE:
    case HttpStatus.SC_UNPROCESSABLE_ENTITY:
    case HttpStatus.SC_LOCKED:
    case HttpStatus.SC_FAILED_DEPENDENCY:
    // Server errors (5xx), in numeric order.
    case HttpStatus.SC_INTERNAL_SERVER_ERROR:
    case HttpStatus.SC_NOT_IMPLEMENTED:
    case HttpStatus.SC_BAD_GATEWAY:
    case HttpStatus.SC_SERVICE_UNAVAILABLE:
    case HttpStatus.SC_GATEWAY_TIMEOUT:
    case HttpStatus.SC_HTTP_VERSION_NOT_SUPPORTED:
    case HttpStatus.SC_INSUFFICIENT_STORAGE:
        return 0;
    default:
        // Informational, success and redirection codes, as well as any
        // status not listed above, all map to 1.
        return 1;
    }
}