/*
 * See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * Esri Inc. licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.esri.gpt.framework.http.crawl;

import com.esri.gpt.framework.http.HttpClientException;
import com.esri.gpt.framework.http.HttpClientRequest;
import com.esri.gpt.framework.robots.Access;
import com.esri.gpt.framework.robots.Bots;
import static com.esri.gpt.framework.robots.BotsUtils.parser;
import static com.esri.gpt.framework.robots.BotsUtils.requestAccess;
import static com.esri.gpt.framework.robots.BotsUtils.transformUrl;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.logging.Logger;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.httpclient.HttpMethodBase;

/**
 * Http crawl request.
 */
public class HttpCrawlRequest extends HttpClientRequest {

  private static final Logger LOG = Logger.getLogger(HttpCrawlRequest.class.getName());

  private final Bots bots;

  /**
   * Creates instance of the request.
   * @param robotsTxt robots.txt or <code>null</code> if robots not available
   */
  public HttpCrawlRequest(Bots robotsTxt) {
    this.bots = robotsTxt;
  }

  @Override
  public void execute() throws IOException {
    this.adviseRobotsTxt();
    super.execute();
  }

  @Override
  public String getUrl() {
    String url = super.getUrl();
    // if robots.txt is available and provides a "Host" attribute, update the
    // url to redirect to a different end-point
    return transformUrl(bots, url);
  }

  /**
   * Gets throttle delay.
   * <p>
   * Throttle delay might be a result of the "Crawl-Delay" value read from robots.txt,
   * or it might be overridden to implement a custom throttle policy.
   * @param bots robots.txt or {@code null} if robots.txt unavailable
   * @return throttle delay in milliseconds or {@code null} if no throttling required
   */
  protected Long resolveThrottleDelay(Bots bots) {
    return bots != null && bots.getCrawlDelay() != null ? 1000L * bots.getCrawlDelay() : null;
  }

  /**
   * Checks robots.txt access for the requested path and enters the per-server
   * crawl throttle before the request is executed.
   * @throws HttpClientException with SC_FORBIDDEN if access is disallowed
   */
  private void adviseRobotsTxt() throws IOException {
    if (bots != null) {
      String url = getRelativePath();
      LOG.fine(String.format("Evaluating access to %s using robots.txt", getUrl()));
      Access access = requestAccess(bots, url);
      if (!access.hasAccess()) {
        LOG.info(String.format("Access to %s disallowed by robots.txt", getUrl()));
        throw new HttpClientException(HttpServletResponse.SC_FORBIDDEN,
            String.format("Access to %s disallowed by robots.txt", getUrl()));
      }
      LOG.fine(String.format("Access to %s allowed by robots.txt", getUrl()));
      CrawlLocker.getInstance().enterServer(getProtocolHostPort(), resolveThrottleDelay(bots));
    }
  }

  /**
   * Gets the protocol://host[:port] portion of the request URL.
   */
  private String getProtocolHostPort() throws MalformedURLException {
    URL u = new URL(getUrl());
    return String.format("%s://%s%s", u.getProtocol(), u.getHost(),
        u.getPort() >= 0 ? ":" + u.getPort() : "");
  }

  /**
   * Gets the path, query, and fragment portion of the request URL.
   */
  private String getRelativePath() throws MalformedURLException {
    URL u = new URL(getUrl());
    return String.format("%s%s%s",
        u.getPath() != null ? u.getPath() : "/",
        u.getQuery() != null ? "?" + u.getQuery() : "",
        u.getRef() != null ? "#" + u.getRef() : "");
  }
}