com.github.brandtg.pantopod.crawler.CrawlingEventHandler.java Source code

Java tutorial

Introduction

Here is the source code for com.github.brandtg.pantopod.crawler.CrawlingEventHandler.java

Source

/**
 * Copyright (C) 2015 Greg Brandt (brandt.greg@gmail.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.brandtg.pantopod.crawler;

import com.github.brandtg.pantopod.api.CrawlEvent;
import com.github.brandtg.pantopod.consumer.PantopodEventHandler;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.util.HashSet;
import java.util.Set;

/**
 * Follows all links on a page.
 */
public abstract class CrawlingEventHandler implements PantopodEventHandler {
    private static Logger LOG = LoggerFactory.getLogger(CrawlingEventHandler.class);

    private final HttpClient httpClient;
    private final boolean checkErrors;
    private final boolean traverseDuplicates;

    public CrawlingEventHandler(HttpClient httpClient) {
        this(httpClient, true, false);
    }

    public CrawlingEventHandler(HttpClient httpClient, boolean checkErrors, boolean traverseDuplicates) {
        this.httpClient = httpClient;
        this.checkErrors = checkErrors;
        this.traverseDuplicates = traverseDuplicates;
    }

    @Override
    public Set<CrawlEvent> handle(CrawlEvent event) throws Exception {
        Set<CrawlEvent> nextEvents = new HashSet<>();

        // Get url
        URI url = URI.create(event.getUrl());
        Document dom = null;
        boolean created = false;
        if (!checkErrors || !hasError(url)) {
            HttpGet req = new HttpGet(url);
            HttpResponse res = httpClient.execute(req);

            try {
                if (res.getStatusLine().getStatusCode() == 200) {
                    byte[] domBytes = IOUtils.toByteArray(res.getEntity().getContent());
                    created = handleData(url, domBytes);
                    dom = Jsoup.parse(new String(domBytes));
                } else {
                    LOG.error("Error for {} #=> {}", url, res.getStatusLine().getStatusCode());
                    markError(url, res.getStatusLine().getStatusCode());
                }
            } finally {
                if (res.getEntity() != null) {
                    EntityUtils.consumeQuietly(res.getEntity());
                }
            }
        }

        // Extract links
        if ((created || traverseDuplicates) && dom != null) {
            for (Element element : dom.select("a")) {
                String href = element.attr("href");
                if (href != null) {
                    URI nextUri = getNextUri(url, href, event.getChroot());
                    if (shouldExplore(nextUri) && isSameDomain(url, nextUri) && isDifferentPage(url, nextUri)) {
                        CrawlEvent nextEvent = new CrawlEvent(event);
                        nextEvent.setUrl(nextUri.toString());
                        nextEvent.setParentUrl(event.getUrl());
                        nextEvent.setDepth(event.getDepth() + 1);
                        nextEvents.add(nextEvent);
                        LOG.debug("Exploring {}", nextUri);
                    } else {
                        LOG.debug("Skipping {}", nextUri);
                    }
                }
            }
        }

        return nextEvents;
    }

    private URI getNextUri(URI url, String href, String chroot) throws Exception {
        //    if (href.contains("..")) {
        //
        //      throw new IllegalArgumentException("Relative URI not allowed: " + href);
        //    }

        URI hrefUri = URI.create(href.trim().replaceAll(" ", "+"));

        URIBuilder builder = new URIBuilder();

        builder.setScheme(hrefUri.getScheme() == null ? url.getScheme() : hrefUri.getScheme());
        builder.setHost(hrefUri.getHost() == null ? url.getHost() : hrefUri.getHost());
        builder.setPort(hrefUri.getPort() == -1 ? url.getPort() : hrefUri.getPort());

        if (hrefUri.getPath() != null) {
            StringBuilder path = new StringBuilder();
            if (hrefUri.getHost() == null && chroot != null) {
                path.append(chroot);
            }

            // Ensure no two slashes
            if (hrefUri.getPath() != null && hrefUri.getPath().length() > 0 && hrefUri.getPath().charAt(0) == '/'
                    && path.length() > 0 && path.charAt(path.length() - 1) == '/') {
                path.setLength(path.length() - 1);
            }

            path.append(hrefUri.getPath());

            builder.setPath(chroot == null ? "" : chroot + hrefUri.getPath());
        }
        if (hrefUri.getQuery() != null) {
            builder.setCustomQuery(hrefUri.getQuery());
        }
        if (hrefUri.getFragment() != null) {
            builder.setFragment(hrefUri.getFragment());
        }

        return builder.build();
    }

    private boolean isSameDomain(URI url, URI nextUrl) throws IOException {
        if (nextUrl.getAuthority() == null || nextUrl.getAuthority().equals(url.getAuthority())) {
            return true;
        }
        handleExternalDomain(url, nextUrl);
        return false;
    }

    private boolean isDifferentPage(URI url, URI nextUrl) throws IOException {
        return nextUrl.getPath() != null && !nextUrl.getPath().equals(url.getPath());
    }

    protected abstract void handleExternalDomain(URI srcUrl, URI dstUrl) throws IOException;

    protected abstract boolean handleData(URI url, byte[] data) throws IOException;

    protected abstract boolean shouldExplore(URI url);

    protected abstract boolean hasError(URI url);

    protected abstract void markError(URI url, int errorCode) throws IOException;
}