List of usage examples for org.apache.commons.httpclient HttpStatus SC_GONE
int SC_GONE
To view the source code for org.apache.commons.httpclient HttpStatus SC_GONE, click the Source Link.
From source file:org.cloudcrawler.domain.crawler.robotstxt.RobotsTxtService.java
/** * This method is used to evaluate if the provided uri is * allowed to be crawled against the robots.txt of the website. * * @param uri//from www.j a va 2s . com * @return * @throws Exception */ public boolean isAllowedUri(URI uri) throws Exception { URIBuilder uriBuilder = new URIBuilder(); uriBuilder.setScheme(uri.getScheme()); uriBuilder.setHost(uri.getHost()); uriBuilder.setUserInfo(uri.getUserInfo()); uriBuilder.setPath("/robots.txt"); URI robotsTxtUri = uriBuilder.build(); BaseRobotRules rules = (BaseRobotRules) cache.get(robotsTxtUri.toString()); if (rules == null) { HttpResponse response = httpService.get(robotsTxtUri); try { // HACK! DANGER! Some sites will redirect the request to the top-level domain // page, without returning a 404. So look for a response which has a redirect, // and the fetched content is not plain text, and assume it's one of these... // which is the same as not having a robotstxt.txt file. String contentType = response.getEntity().getContentType().getValue(); boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain")); if (response.getStatusLine().getStatusCode() == 404 || !isPlainText) { rules = robotsTxtParser.failedFetch(HttpStatus.SC_GONE); } else { StringWriter writer = new StringWriter(); IOUtils.copy(response.getEntity().getContent(), writer); rules = robotsTxtParser.parseContent(uri.toString(), writer.toString().getBytes(), response.getEntity().getContentType().getValue(), httpService.getUserAgent()); } } catch (Exception e) { EntityUtils.consume(response.getEntity()); throw e; } EntityUtils.consume(response.getEntity()); cache.set(robotsTxtUri.toString(), 60 * 60 * 24, rules); } return rules.isAllowed(uri.toString()); }
From source file:org.opens.tanaguru.util.http.HttpRequestHandler.java
/**
 * Maps an HTTP status code to a binary flag.
 *
 * @param status the HTTP status code to classify
 * @return {@code 0} when the code is one of the 4xx/5xx constants listed
 *         below, {@code 1} for every other value
 */
private int computeStatus(int status) {
    int result;
    switch (status) {
        // Client errors (4xx)
        case HttpStatus.SC_BAD_REQUEST:
        case HttpStatus.SC_UNAUTHORIZED:
        case HttpStatus.SC_PAYMENT_REQUIRED:
        case HttpStatus.SC_FORBIDDEN:
        case HttpStatus.SC_NOT_FOUND:
        case HttpStatus.SC_METHOD_NOT_ALLOWED:
        case HttpStatus.SC_NOT_ACCEPTABLE:
        case HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED:
        case HttpStatus.SC_REQUEST_TIMEOUT:
        case HttpStatus.SC_CONFLICT:
        case HttpStatus.SC_GONE:
        case HttpStatus.SC_LENGTH_REQUIRED:
        case HttpStatus.SC_PRECONDITION_FAILED:
        case HttpStatus.SC_REQUEST_TOO_LONG:
        case HttpStatus.SC_REQUEST_URI_TOO_LONG:
        case HttpStatus.SC_UNSUPPORTED_MEDIA_TYPE:
        case HttpStatus.SC_REQUESTED_RANGE_NOT_SATISFIABLE:
        case HttpStatus.SC_EXPECTATION_FAILED:
        case HttpStatus.SC_INSUFFICIENT_SPACE_ON_RESOURCE:
        case HttpStatus.SC_METHOD_FAILURE:
        case HttpStatus.SC_UNPROCESSABLE_ENTITY:
        case HttpStatus.SC_LOCKED:
        case HttpStatus.SC_FAILED_DEPENDENCY:
        // Server errors (5xx)
        case HttpStatus.SC_INTERNAL_SERVER_ERROR:
        case HttpStatus.SC_NOT_IMPLEMENTED:
        case HttpStatus.SC_BAD_GATEWAY:
        case HttpStatus.SC_SERVICE_UNAVAILABLE:
        case HttpStatus.SC_GATEWAY_TIMEOUT:
        case HttpStatus.SC_HTTP_VERSION_NOT_SUPPORTED:
        case HttpStatus.SC_INSUFFICIENT_STORAGE:
            result = 0;
            break;
        // Informational, success and redirect codes, as well as any code
        // not listed above, all yield 1.
        default:
            result = 1;
            break;
    }
    return result;
}