com.qwazr.crawler.web.manager.WebCrawlThread.java Source code

Introduction

Here is the source code for com.qwazr.crawler.web.manager.WebCrawlThread.java
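
WebCrawlThread drives a single crawl session: it validates the WebCrawlDefinition, loads pages through a BrowserDriver built by BrowserDriverBuilder, applies wildcard inclusion/exclusion patterns and optional robots.txt rules, and runs user scripts before and after the session and around each crawled URI.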

Source

/**
 * Copyright 2014-2016 Emmanuel Keller / QWAZR
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package com.qwazr.crawler.web.manager;

import com.google.common.net.InternetDomainName;
import com.qwazr.cluster.manager.ClusterManager;
import com.qwazr.crawler.web.CurrentURI;
import com.qwazr.crawler.web.driver.BrowserDriver;
import com.qwazr.crawler.web.driver.BrowserDriverBuilder;
import com.qwazr.crawler.web.service.WebCrawlDefinition;
import com.qwazr.crawler.web.service.WebCrawlDefinition.EventEnum;
import com.qwazr.crawler.web.service.WebCrawlDefinition.Script;
import com.qwazr.crawler.web.service.WebCrawlStatus;
import com.qwazr.scripts.ScriptManager;
import com.qwazr.scripts.ScriptRunThread;
import com.qwazr.utils.TimeTracker;
import com.qwazr.utils.WildcardMatcher;
import com.qwazr.utils.server.ServerException;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URIBuilder;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.WebElement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.script.ScriptException;
import javax.ws.rs.core.Response.Status;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

public class WebCrawlThread implements Runnable {

    private static final Logger logger = LoggerFactory.getLogger(WebCrawlThread.class);

    private final CurrentSessionImpl session;
    final WebCrawlDefinition crawlDefinition;
    private final InternetDomainName internetDomainName;

    private final List<Matcher> parametersMatcherList;
    private final List<WildcardMatcher> exclusionMatcherList;
    private final List<WildcardMatcher> inclusionMatcherList;

    private BrowserDriver driver = null;

    private final Map<URI, RobotsTxt> robotsTxtMap;
    private final String robotsTxtUserAgent;

    private final TimeTracker timeTracker;

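    /**
     * Builds the crawl session: validates the definition (browser_type and
     * entry_url are mandatory), compiles the parameter and inclusion/exclusion
     * matchers, and parses the domain name of the entry URL.
     */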
    WebCrawlThread(String sessionName, WebCrawlDefinition crawlDefinition) throws ServerException {
        timeTracker = new TimeTracker();
        this.session = new CurrentSessionImpl(crawlDefinition, sessionName, timeTracker);
        this.crawlDefinition = crawlDefinition;
        if (crawlDefinition.browser_type == null)
            throw new ServerException(Status.NOT_ACCEPTABLE, "The browser_type is missing");
        if (crawlDefinition.entry_url == null)
            throw new ServerException(Status.NOT_ACCEPTABLE, "The entry_url is missing");
        parametersMatcherList = getRegExpMatcherList(crawlDefinition.parameters_patterns);
        exclusionMatcherList = getWildcardMatcherList(crawlDefinition.exclusion_patterns);
        inclusionMatcherList = getWildcardMatcherList(crawlDefinition.inclusion_patterns);
        if (crawlDefinition.robots_txt_enabled != null && crawlDefinition.robots_txt_enabled)
            robotsTxtMap = new HashMap<>();
        else
            robotsTxtMap = null;
        robotsTxtUserAgent = crawlDefinition.robots_txt_useragent == null ? "QWAZR_BOT"
                : crawlDefinition.robots_txt_useragent;
        try {
            URI uri = new URI(crawlDefinition.entry_url);
            String host = uri.getHost();
            if (host == null)
                throw new URISyntaxException(crawlDefinition.entry_url, "No host found.", -1);
            internetDomainName = InternetDomainName.from(host);
        } catch (URISyntaxException e) {
            throw new ServerException(Status.NOT_ACCEPTABLE, e.getMessage());
        } finally {
            timeTracker.next("Initialization");
        }
    }

    private final static List<Matcher> getRegExpMatcherList(List<String> patternList) throws ServerException {
        if (patternList == null || patternList.isEmpty())
            return null;
        try {
            List<Matcher> matcherList = new ArrayList<Matcher>(patternList.size());
            for (String pattern : patternList) {
                Matcher matcher = Pattern.compile(pattern).matcher(StringUtils.EMPTY);
                matcherList.add(matcher);
            }
            return matcherList;
        } catch (PatternSyntaxException e) {
            throw new ServerException(Status.NOT_ACCEPTABLE, e.getMessage());
        }
    }

    private final static List<WildcardMatcher> getWildcardMatcherList(List<String> patternList) {
        if (patternList == null || patternList.isEmpty())
            return null;
        List<WildcardMatcher> matcherList = new ArrayList<WildcardMatcher>(patternList.size());
        for (String pattern : patternList)
            matcherList.add(new WildcardMatcher(pattern));
        return matcherList;
    }

    String getSessionName() {
        return session.getName();
    }

    WebCrawlStatus getStatus() {
        return new WebCrawlStatus(ClusterManager.INSTANCE.myAddress, crawlDefinition.entry_url, session);
    }

    void abort(String reason) {
        session.abort(reason);
    }

    private final static boolean checkRegExpMatcher(String value, List<Matcher> matcherList) {
        for (Matcher matcher : matcherList) {
            matcher.reset(value);
            if (matcher.find())
                return true;
        }
        return false;
    }

    private final static boolean checkWildcardMatcher(String value, List<WildcardMatcher> matcherList) {
        for (WildcardMatcher matcher : matcherList)
            if (matcher.match(value))
                return true;
        return false;
    }

    /**
     * Check the inclusion list.
     *
     * @param uriString the URI to check
     * @return null if the inclusion list is empty, true if the URI matches at
     * least one inclusion pattern, false otherwise
     */
    private Boolean matchesInclusion(String uriString) {
        if (inclusionMatcherList == null || inclusionMatcherList.isEmpty())
            return null;
        return checkWildcardMatcher(uriString, inclusionMatcherList);
    }

    /**
     * Check the exclusion list.
     *
     * @param uriString the URI to check
     * @return false if the exclusion list is empty or the URI matches no
     * exclusion pattern, true otherwise
     */
    private Boolean matchesExclusion(String uriString) {
        if (exclusionMatcherList == null || exclusionMatcherList.isEmpty())
            return false;
        return checkWildcardMatcher(uriString, exclusionMatcherList);
    }

    /**
     * Remove the fragment if remove_fragments is set to true.
     *
     * @param uriBuilder the URI builder to update
     */
    private void checkRemoveFragment(URIBuilder uriBuilder) {
        if (crawlDefinition.remove_fragments == null || !crawlDefinition.remove_fragments)
            return;
        uriBuilder.setFragment(null);
    }

    /**
     * Remove any query parameter matching the parameters_patterns list.
     *
     * @param uriBuilder the URI builder to update
     */
    private void checkRemoveParameter(URIBuilder uriBuilder) {
        if (parametersMatcherList == null || parametersMatcherList.isEmpty())
            return;
        List<NameValuePair> oldParams = uriBuilder.getQueryParams();
        if (oldParams == null || oldParams.isEmpty())
            return;
        uriBuilder.clearParameters();
        for (NameValuePair param : oldParams)
            if (!checkRegExpMatcher(param.getName() + "=" + param.getValue(), parametersMatcherList))
                uriBuilder.addParameter(param.getName(), param.getValue());
    }

    /**
     * Remove the fragment and the query parameters according to the crawl
     * configuration.
     *
     * @param uri the URI to clean
     * @return the cleaned URI, or null if the rebuilt URI is invalid
     */
    private URI checkLink(URI uri) {
        URIBuilder uriBuilder = new URIBuilder(uri);
        checkRemoveFragment(uriBuilder);
        checkRemoveParameter(uriBuilder);
        try {
            return uriBuilder.build();
        } catch (URISyntaxException e) {
            logger.warn(e.getMessage(), e);
            return null;
        }
    }

    private Collection<URI> checkLinks(Collection<URI> uris) {
        if (uris == null)
            return null;
        Map<String, URI> linkMap = new LinkedHashMap<String, URI>();
        for (URI linkURI : uris) {
            linkURI = checkLink(linkURI);
            if (linkURI != null)
                linkMap.put(linkURI.toString(), linkURI);
        }
        return linkMap.values();
    }

    private boolean matchesInitialDomain(URI uri) {
        String host = uri.getHost();
        if (StringUtils.isEmpty(host))
            return false;
        if (!InternetDomainName.isValid(host))
            return false;
        return internetDomainName.equals(InternetDomainName.from(host));
    }

    private String scriptBeforeCrawl(CurrentURIImpl currentURI, String uriString)
            throws ServerException, IOException, ClassNotFoundException {
        URI uri = currentURI.getURI();
        if (uriString == null)
            uriString = uri.toString();

        currentURI.setStartDomain(matchesInitialDomain(uri));

        // We check the inclusion/exclusion.
        currentURI.setInInclusion(matchesInclusion(uriString));
        currentURI.setInExclusion(matchesExclusion(uriString));

        if (currentURI.isInInclusion() != null && !currentURI.isInInclusion())
            currentURI.setIgnored(true);

        if (currentURI.isInExclusion() != null && currentURI.isInExclusion())
            currentURI.setIgnored(true);

        script(EventEnum.before_crawl, currentURI);
        return uriString;
    }

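    /**
     * Crawls a single page with the browser driver: loads the URL, re-checks
     * the inclusion/exclusion rules when a redirection occurred, reads the
     * base href element if the page provides one, then extracts and filters
     * the outgoing links.
     */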
    private void crawl(CurrentURIImpl currentURI) {

        if (session.isAborting())
            return;

        URI uri = currentURI.getInitialURI();
        String uriString = uri.toString();
        session.setCurrentURI(uriString, currentURI.getDepth());

        // Check that the URL is well-formed (only http and https are crawled)
        String scheme = uri.getScheme();
        if (!"http".equalsIgnoreCase(scheme) && !"https".equalsIgnoreCase(scheme)) {
            session.incIgnoredCount();
            currentURI.setIgnored(true);
            if (logger.isInfoEnabled())
                logger.info("Ignored (not http) " + uri);
            return;
        }

        // Load the URL
        if (logger.isInfoEnabled())
            logger.info("Crawling " + uri + " (" + currentURI.getDepth() + ")");
        try {
            timeTracker.next(null);
            driver.get(uriString);
            //if (mainWindow != null && !mainWindow.equals(driver.getWindowHandle()))
            //   driver.switchTo().window(mainWindow);
        } catch (Exception e) {
            session.incErrorCount();
            currentURI.setError(driver, e);
            return;
        } finally {
            timeTracker.next("Driver.getURL");
        }

        try {
            uriString = driver.getCurrentUrl();
            uri = new URI(uriString);
            currentURI.setFinalURI(uri);
        } catch (URISyntaxException e) {
            session.incErrorCount();
            currentURI.setError(driver, e);
            return;
        }

        // Re-check the inclusion/exclusion lists
        // in case of a redirection
        if (currentURI.isRedirected()) {
            if (logger.isInfoEnabled())
                logger.info("Redirected " + currentURI.getInitialURI() + " to " + uriString);
            try {
                scriptBeforeCrawl(currentURI, uriString);
            } catch (Exception e) {
                session.incErrorCount();
                currentURI.setError(driver, e);
                return;
            }
            if (currentURI.isIgnored()) {
                session.incIgnoredCount();
                return;
            }
        }

        // Support for the <base href> element
        boolean searchBaseHref = true;
        try {
            searchBaseHref = "text/html".equals(driver.getContentType());
        } catch (WebDriverException e) {
            // OK that's not really an error
        }

        if (searchBaseHref) {
            try {
                WebElement baseElement = driver.findElement(By.tagName("base"));
                if (baseElement != null) {
                    String href = baseElement.getAttribute("href");
                    try {
                        currentURI.setBaseURI(new URI(href));
                    } catch (URISyntaxException e) {
                        if (logger.isWarnEnabled())
                            logger.warn("Invalid URI in base HREF: " + href + " in " + uriString);
                    }
                }
            } catch (org.openqa.selenium.NoSuchElementException e) {
                // OK that's not really an error
            } catch (Exception e) {
                if (logger.isWarnEnabled())
                    logger.warn("Cannot locate base href for " + uriString + " " + e.getMessage());
            }
        }

        int crawledCount = session.incCrawledCount();
        currentURI.setCrawled();
        if (crawlDefinition.max_url_number != null && crawledCount >= crawlDefinition.max_url_number)
            abort("Max URL number reached: " + crawlDefinition.max_url_number);

        // Let's look for the <a> tags
        Set<String> hrefSet = new LinkedHashSet<String>();
        try {
            timeTracker.next(null);
            driver.findLinks(driver, hrefSet);
        } catch (Exception e) {
            if (logger.isWarnEnabled())
                logger.warn("Cannot extract links from " + uriString, e);
        } finally {
            timeTracker.next("Find links");
        }
        if (hrefSet.isEmpty())
            return;
        ArrayList<URI> uris = new ArrayList<URI>(hrefSet.size());
        currentURI.hrefToURICollection(hrefSet, uris);
        currentURI.setLinks(uris);
        if (logger.isInfoEnabled())
            logger.info("Links found in " + uri + ": " + uris.size());

        ArrayList<URI> filteredURIs = new ArrayList<URI>();
        for (URI u : uris) {
            String us = u.toString();
            Boolean inc = matchesInclusion(us);
            if (inc != null && !inc)
                continue;
            Boolean exc = matchesExclusion(us);
            if (exc != null && exc)
                continue;
            filteredURIs.add(u);
        }
        currentURI.setFilteredLinks(filteredURIs);
    }

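    /**
     * Checks the robots.txt rules for the given URI. Downloaded files are
     * cached per robots.txt URI; returns null when robots.txt support is
     * disabled.
     */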
    private RobotsTxt.RobotsTxtStatus checkRobotsTxt(CurrentURI currentURI) throws IOException, URISyntaxException,
            NoSuchAlgorithmException, KeyStoreException, KeyManagementException {
        if (robotsTxtMap == null)
            return null;
        timeTracker.next(null);
        try {
            URI uri = currentURI.getURI();
            URI robotsTxtURI = RobotsTxt.getRobotsURI(uri);
            RobotsTxt robotsTxt = robotsTxtMap.get(robotsTxtURI);
            if (robotsTxt == null) {
                robotsTxt = RobotsTxt.download(driver.getProxy(), robotsTxtUserAgent, robotsTxtURI);
                robotsTxtMap.put(robotsTxtURI, robotsTxt);
            }
            return robotsTxt.getStatus(uri);
        } finally {
            timeTracker.next("Robots.txt check");
        }
    }

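    /**
     * Crawls one URI: runs the before_crawl scripts, applies the robots.txt
     * rules, crawls the page, then runs the after_crawl scripts. Same-level
     * links are crawled recursively at the current depth, while the other
     * links are collected for the next level.
     */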
    private void crawlOne(final Set<URI> crawledURIs, URI uri, final Set<URI> nextLevelURIs, final int depth)
            throws ServerException, IOException, ClassNotFoundException {

        if (session.isAborting())
            return;

        // Skip the URI if it has already been crawled
        if (crawledURIs != null) {
            if (crawledURIs.contains(uri))
                return;
            crawledURIs.add(uri);
        }

        CurrentURIImpl currentURI = new CurrentURIImpl(uri, depth);

        // Hand control to the "before_crawl" scripts
        scriptBeforeCrawl(currentURI, null);

        if (!currentURI.isIgnored()) {

            // Check the robotsTxt status
            try {
                RobotsTxt.RobotsTxtStatus robotsTxtStatus = checkRobotsTxt(currentURI);
                if (robotsTxtStatus != null && !robotsTxtStatus.isCrawlable)
                    currentURI.setIgnored(true);
            } catch (Exception e) {
                session.incErrorCount();
                currentURI.setError(driver, e);
            }

            if (!currentURI.isIgnored() && currentURI.getError() == null) {
                crawl(currentURI);

                // Store the final URI (in case of redirection)
                if (crawledURIs != null)
                    crawledURIs.add(currentURI.getURI());
            }
        }
        script(EventEnum.after_crawl, currentURI);

        Collection<URI> sameLevelLinks = checkLinks(currentURI.getSameLevelLinks());
        if (sameLevelLinks != null)
            for (URI sameLevelURI : sameLevelLinks)
                crawlOne(crawledURIs, sameLevelURI, nextLevelURIs, depth);

        Collection<URI> newLinks = checkLinks(currentURI.getLinks());
        currentURI.setLinks(newLinks);
        if (newLinks != null && nextLevelURIs != null)
            nextLevelURIs.addAll(newLinks);

    }

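    /**
     * Crawls one depth level (a breadth-first traversal): every URI of the
     * level is crawled, the collected links form the next level, and the
     * recursion stops when max_depth is undefined or exceeded.
     */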
    private void crawlLevel(Set<URI> crawledURIs, Collection<URI> levelURIs, int depth)
            throws ServerException, IOException, URISyntaxException, NoSuchAlgorithmException, KeyStoreException,
            KeyManagementException, ClassNotFoundException {

        if (session.isAborting())
            return;

        if (levelURIs == null || levelURIs.isEmpty())
            return;

        final Set<URI> nextLevelURIs = new HashSet<URI>();

        // Crawl all URLs from the level
        for (URI uri : levelURIs)
            crawlOne(crawledURIs, uri, nextLevelURIs, depth);

        // Stop here if no max_depth is defined or once the next depth exceeds it
        depth++;
        if (crawlDefinition.max_depth == null || depth > crawlDefinition.max_depth)
            return;

        // Let's crawl the next level if any
        crawlLevel(crawledURIs, nextLevelURIs, depth);

    }

    private void crawlUrlMap(Set<URI> crawledURIs, Map<String, Integer> urlMap) {

        urlMap.forEach((uri, depth) -> {
            try {
                crawlOne(crawledURIs, new URI(uri), null, depth);
            } catch (Exception e) {
                logger.warn("Error while crawling the URI: " + uri, e);
            }
        });
    }

    /**
     * Execute the scripts related to the passed event.
     *
     * @param event      the expected event
     * @param currentURI the current URI description
     * @return true if a script was executed, false if no script is attached
     * to the event
     * @throws ServerException        if the execution of the script failed
     * @throws IOException            if any I/O exception occurs
     * @throws ClassNotFoundException if the Java class is not found
     */
    private boolean script(EventEnum event, CurrentURI currentURI)
            throws ServerException, IOException, ClassNotFoundException {
        if (crawlDefinition.scripts == null)
            return false;
        Script script = crawlDefinition.scripts.get(event);
        if (script == null)
            return false;
        timeTracker.next(null);
        try {
            Map<String, Object> objects = new TreeMap<String, Object>();
            objects.put("session", session);
            if (script.variables != null)
                objects.putAll(script.variables);
            if (driver != null)
                objects.put("driver", driver);
            if (currentURI != null)
                objects.put("current", currentURI);
            ScriptRunThread scriptRunThread = ScriptManager.getInstance().runSync(script.name, objects);
            if (scriptRunThread.getException() != null)
                throw new ServerException(scriptRunThread.getException());
            return true;
        } finally {
            timeTracker.next("Event: " + event.name());
        }
    }

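    /**
     * Runs the whole session: builds the browser driver, fires the
     * before_session scripts, crawls either the given URL map or the entry
     * URL tree, then always quits the driver and fires the after_session
     * scripts.
     */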
    private void runner() throws URISyntaxException, IOException, ScriptException, ServerException,
            ReflectiveOperationException, NoSuchAlgorithmException, KeyStoreException, KeyManagementException {
        try {
            driver = new BrowserDriverBuilder(crawlDefinition).build();
            script(EventEnum.before_session, null);
            final Set<URI> crawledURIs = new HashSet<>();
            if (crawlDefinition.urls != null && !crawlDefinition.urls.isEmpty())
                crawlUrlMap(crawledURIs, crawlDefinition.urls);
            else
                crawlLevel(crawledURIs, Arrays.asList(new URI(crawlDefinition.entry_url)), 0);
        } finally {
            try {
                if (driver != null)
                    driver.quit();
            } catch (Exception e) {
                logger.warn(e.getMessage(), e);
            }
            script(EventEnum.after_session, null);
        }
    }

    @Override
    final public void run() {
        try {
            runner();
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        } finally {
            WebCrawlerManager.INSTANCE.removeSession(this);
        }
    }
}
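
Usage example

Below is a minimal, hypothetical sketch of how a crawl definition consumed by this thread might be assembled. The field names (entry_url, browser_type, inclusion_patterns, exclusion_patterns, max_depth, max_url_number, robots_txt_enabled, robots_txt_useragent) are exactly those read by WebCrawlThread above; the no-arg constructor, the direct field assignments, and the submission through WebCrawlerManager are assumptions based on this file alone, since the WebCrawlThread constructor is package-private and run() only reports back to WebCrawlerManager.INSTANCE.

import com.qwazr.crawler.web.service.WebCrawlDefinition;

import java.util.Arrays;

public class CrawlDefinitionExample {

    public static void main(String[] args) {
        // Assumption: WebCrawlDefinition exposes public fields, as the direct
        // field reads in WebCrawlThread (crawlDefinition.entry_url, ...) suggest.
        final WebCrawlDefinition crawl = new WebCrawlDefinition();

        // Mandatory: the WebCrawlThread constructor rejects a definition
        // missing entry_url or browser_type.
        crawl.entry_url = "https://www.qwazr.com/";
        // crawl.browser_type = ...; // enum type declared in WebCrawlDefinition, not shown in this file

        // Wildcard patterns, compiled by getWildcardMatcherList(...)
        crawl.inclusion_patterns = Arrays.asList("https://www.qwazr.com/*");
        crawl.exclusion_patterns = Arrays.asList("*/private/*");

        // Limits enforced in crawl() and crawlLevel()
        crawl.max_depth = 2;
        crawl.max_url_number = 1000;

        // Enables the robots.txt cache used by checkRobotsTxt()
        crawl.robots_txt_enabled = true;
        crawl.robots_txt_useragent = "QWAZR_BOT";

        // The definition is then submitted through WebCrawlerManager, whose
        // start API is not part of this file and is therefore omitted here.
    }
}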