edu.uci.ics.crawler4j.crawler.CrawlController.java Source code

Introduction

Here is the source code for edu.uci.ics.crawler4j.crawler.CrawlController.java. The CrawlController class drives a crawl: it sets up the Berkeley DB-backed frontier, registers seed URLs, and supervises a pool of crawler threads until the crawl finishes.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.crawler;

import java.io.File;
import java.io.IOException;

import java.lang.Thread.State;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;

import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.IO;

/**
 * Manages a crawl session: initializes the Berkeley DB environment that
 * backs the frontier and the doc-id server, registers seed URLs, and
 * supervises the crawler threads until the frontier is exhausted.
 * 
 * @author Yasser Ganjisaffar <yganjisa at uci dot edu>
 */

public final class CrawlController {

    /** The Constant logger. */
    private static final Logger logger = Logger.getLogger(CrawlController.class.getName());

    /** The Berkeley DB environment backing the frontier and the doc-id server. */
    private Environment env;

    /** The crawlers local data. */
    private List<Object> crawlersLocalData = new ArrayList<Object>();

    /** The base url to start crawling from. */
    private String baseUrl;

    /** Whether the frontier directory has already been deleted in this JVM. */
    private static boolean frontierDirDeleted = false;

    /** Delay, in seconds, between health checks of the crawler threads. */
    private static final int DELAY = 20;

    /**
     * Gets the crawlers local data.
     * 
     * @return the crawlers local data
     */
    public List<Object> getCrawlersLocalData() {
        return crawlersLocalData;
    }

    /** The crawler threads. */
    List<Thread> threads;

    /**
     * Instantiates a new crawl controller.
     * 
     * @param storageFolder
     *            the folder in which crawl state (the frontier database) is kept
     * @throws Exception
     *             if the storage folder or the Berkeley DB environment cannot be initialized
     */
    public CrawlController(String storageFolder) throws Exception {
        this(storageFolder, Configurations.getBooleanProperty("crawler.enable_resume", true));
    }

    /**
     * Instantiates a new crawl controller.
     * 
     * @param storageFolder
     *            the folder in which crawl state (the frontier database) is kept
     * @param resumable
     *            whether the crawl can be resumed after a shutdown; enables
     *            transactions and locking on the Berkeley DB environment
     * @throws Exception
     *             if the storage folder or the Berkeley DB environment cannot be initialized
     */
    public CrawlController(String storageFolder, boolean resumable) throws Exception {
        deleteFrontierDb(storageFolder);

        File folder = new File(storageFolder);
        if (!folder.exists()) {
            folder.mkdirs();
        }

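        // Configure the Berkeley DB environment. Resumable crawls need
        // transactions and locking so that state survives a restart.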
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);
        envConfig.setTransactional(resumable);
        envConfig.setLocking(resumable);

        File envHome = new File(storageFolder + "/frontier");
        if (!envHome.exists()) {
            envHome.mkdir();
        }

        /*
         * if (!resumable) { IO.deleteFolderContents(envHome); }
         */

        env = new Environment(envHome, envConfig);

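        // Initialize the shared frontier (the URL work queue) and the
        // doc-id server on top of the same environment.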
        Frontier.init(env, resumable);

        DocIDServer.init(env, resumable);

        PageFetcher.startConnectionMonitorThread();
    }

    private static synchronized void deleteFrontierDb(String storageFolder) throws IOException {
        // Delete the frontier directory only once per JVM, so repeated
        // controller instances do not wipe each other's state. The method is
        // static synchronized because it guards a static flag.
        if (!frontierDirDeleted) {
            File frontierDb = new File(storageFolder + "/frontier");
            logger.info("Deleting : " + frontierDb.getAbsolutePath());
            FileUtils.deleteDirectory(frontierDb);
            frontierDirDeleted = true;
        }
    }

    /**
     * Starts the crawl with the given number of crawler threads and blocks
     * until the crawl finishes.
     * 
     * @param <T>
     *            the crawler type
     * @param _c
     *            the crawler class to instantiate; it must have a public
     *            no-argument constructor
     * @param numberOfCrawlers
     *            the number of concurrent crawler threads to start
     */
    public <T extends WebCrawler> void start(Class<T> _c, int numberOfCrawlers) {
        try {

            crawlersLocalData.clear();
            threads = new ArrayList<Thread>();
            List<T> crawlers = new ArrayList<T>();
            for (int i = 1; i <= numberOfCrawlers; i++) {
                T crawler = _c.newInstance();
                Thread thread = new Thread(crawler, "Crawler " + i);
                logger.info("Thread state1 = " + thread.getState().toString());

                crawler.setThread(thread);
                crawler.setMyId(i);
                crawler.setMyController(this);
                thread.start();
                logger.info("Thread state2 = " + thread.getState().toString());
                crawlers.add(crawler);
                threads.add(thread);
                logger.info("Crawler " + i + " started.");
            }

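            // Monitoring loop: restart dead threads, and shut down once no
            // thread is RUNNABLE and the frontier queue stays empty.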
            while (true) {
                sleep(DELAY);
                boolean someoneIsWorking = false;

                for (int i = 0; i < threads.size(); i++) {
                    Thread thread = threads.get(i);
                    if (!thread.isAlive()) {
                        recreateThread(_c, crawlers, i);
                    } else if (thread.getState() == State.RUNNABLE) {
                        someoneIsWorking = true;
                        logger.info("Thread " + i + " was RUNNABLE.");
                    } else if (thread.getState() == State.WAITING) {
                        logger.info("Thread " + i + " was WAITING.");
                        // thread.interrupt();
                        // thread.join();
                    } else {
                        logger.info("Thread " + i + thread.getState().toString());
                        // recreateThread(_c, crawlers, i);
                    }
                }

                if (!someoneIsWorking) {
                    // Make sure again that none of the threads are alive.
                    logger.info("It looks like no thread is working, waiting for 20 second to make sure...");
                    sleep(DELAY);

                    if (!isAnyThreadWorking()) {
                        long queueLength = Frontier.getQueueLength();
                        if (queueLength > 0) {
                            continue;
                        }
                        logger.info("No thread is working and no more URLs are in the queue; waiting another "
                                + DELAY + " seconds to make sure...");
                        sleep(DELAY);
                        queueLength = Frontier.getQueueLength();
                        if (queueLength > 0) {
                            continue;
                        }
                        logger.info("All of the crawlers are stopped. Finishing the process...");
                        for (T crawler : crawlers) {
                            crawler.onBeforeExit();
                            crawlersLocalData.add(crawler.getMyLocalData());
                        }

                        // At this point the frontier notifies the threads that were waiting
                        // for new URLs that they should stop. We wait a few seconds for them
                        // and then return.
                        Frontier.finish();
                        logger.info("Waiting for 1 second before final clean up...");
                        sleep(DELAY);

                        try {
                            Frontier.close();
                            env.close();

                        } catch (Exception e) {
                            logger.error("Error while closing the frontier or the environment", e);
                        }

                        /*
                         * for (int i = 0; i < threads.size(); i++) {
                         *     Thread thread = threads.get(i);
                         *     logger.info("Thread state = " + thread.getState().toString());
                         *     if (thread.isAlive()) {
                         *         logger.info("Wait for live thread to die");
                         *         thread.join();
                         *     }
                         * }
                         */
                        // PageFetcher.stopConnectionMonitorThread();
                        return;
                    }
                }
            }
        } catch (Exception e) {
            logger.error("Error while running the crawl", e);
        }
    }

    /**
     * Recreates a crawler thread that has died, replacing it in the thread
     * and crawler lists.
     * 
     * @param <T>
     *            the crawler type
     * @param _c
     *            the crawler class to instantiate
     * @param crawlers
     *            the list of crawler instances
     * @param i
     *            the index of the dead thread
     * @throws InstantiationException
     *             the instantiation exception
     * @throws IllegalAccessException
     *             the illegal access exception
     */
    private <T extends WebCrawler> void recreateThread(Class<T> _c, List<T> crawlers, int i)
            throws InstantiationException, IllegalAccessException {
        logger.info("Thread " + i + " was dead; recreating it.");
        T crawler = _c.newInstance();
        Thread thread = new Thread(crawler, "Crawler " + (i + 1));
        logger.info("Thread state before restart: " + thread.getState());
        threads.set(i, thread);
        crawler.setThread(thread);
        crawler.setMyId(i + 1);
        crawler.setMyController(this);
        thread.start();
        logger.info("Thread state after restart: " + thread.getState());
        crawlers.set(i, crawler);
    }

    /**
     * Sleep.
     * 
     * @param seconds
     *            the number of seconds to sleep
     */
    private void sleep(int seconds) {
        try {
            Thread.sleep(seconds * 1000L);
        } catch (InterruptedException e) {
            // Interrupted; stop sleeping and return.
            return;
        }
    }

    /**
     * Checks whether any crawler thread is alive and RUNNABLE.
     * 
     * @return true if at least one crawler thread is working
     */
    private boolean isAnyThreadWorking() {
        for (Thread thread : threads) {
            if (thread.isAlive() && thread.getState() == State.RUNNABLE) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds the seed.
     * 
     * @param pageUrl
     *            the URL of the seed page
     */
    public void addSeed(String pageUrl) {
        String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
        if (canonicalUrl == null) {
            logger.error("Invalid seed URL: " + pageUrl);
            return;
        }
        int docid = DocIDServer.getDocID(canonicalUrl);
        if (docid > 0) {
            // This URL is already seen.
            return;
        }

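        // Assign a fresh document id and, if robots.txt allows it, schedule
        // the seed on the frontier.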
        WebURL webUrl = new WebURL();
        webUrl.setURL(canonicalUrl);
        docid = DocIDServer.getNewDocID(canonicalUrl);
        webUrl.setDocid(docid);
        webUrl.setDepth((short) 0);
        if (!RobotstxtServer.allows(webUrl)) {
            logger.info("Robots.txt does not allow this seed: " + pageUrl);
        } else {
            Frontier.schedule(webUrl);
        }

        saveBaseUrl(pageUrl);

    }

    /**
     * Saves the base URL derived from the given seed URL.
     * 
     * @param pageUrl
     *            the seed URL
     */
    private void saveBaseUrl(String pageUrl) {
        // Keep the URL up to the first slash after the scheme, then drop
        // everything up to and including the first dot, so
        // "http://www.example.com/path" yields "example.com".
        int lastPos = pageUrl.indexOf("//") + 2;
        lastPos = pageUrl.indexOf("/", lastPos);
        if (lastPos == -1) {
            baseUrl = pageUrl;
        } else {
            baseUrl = pageUrl.substring(0, lastPos);
            int pos = baseUrl.indexOf(".");
            if (pos > 0) {
                baseUrl = baseUrl.substring(pos + 1);
            }
        }
        logger.info("baseUrl = " + baseUrl);
    }

    /**
     * Sets the politeness delay.
     * 
     * @param milliseconds
     *            the new politeness delay
     */
    public void setPolitenessDelay(int milliseconds) {
        if (milliseconds < 0) {
            return;
        }
        // Clamp to at most 10 seconds.
        if (milliseconds > 10000) {
            milliseconds = 10000;
        }
        PageFetcher.setPolitenessDelay(milliseconds);
    }

    /**
     * Sets the maximum crawl depth.
     * 
     * @param depth
     *            the new maximum crawl depth
     * @throws Exception
     *             if the depth is less than -1 or greater than Short.MAX_VALUE
     */
    public void setMaximumCrawlDepth(int depth) throws Exception {
        if (depth < -1) {
            throw new Exception(
                    "Maximum crawl depth should be a non-negative number, or -1 for unlimited depth.");
        }
        if (depth > Short.MAX_VALUE) {
            throw new Exception("Maximum value for crawl depth is " + Short.MAX_VALUE);
        }
        WebCrawler.setMaximumCrawlDepth((short) depth);
    }

    /**
     * Sets the maximum pages to fetch.
     * 
     * @param max
     *            the new maximum pages to fetch
     */
    public void setMaximumPagesToFetch(int max) {
        Frontier.setMaximumPagesToFetch(max);
    }

    /**
     * Sets the proxy.
     * 
     * @param proxyHost
     *            the proxy host
     * @param proxyPort
     *            the proxy port
     */
    public void setProxy(String proxyHost, int proxyPort) {
        PageFetcher.setProxy(proxyHost, proxyPort);
    }

    /**
     * Sets the proxy.
     * 
     * @param proxyHost
     *            the proxy host
     * @param proxyPort
     *            the proxy port
     * @param username
     *            the username
     * @param password
     *            the password
     */
    public static void setProxy(String proxyHost, int proxyPort, String username, String password) {
        PageFetcher.setProxy(proxyHost, proxyPort, username, password);
    }

    /**
     * Gets the base url.
     * 
     * @return the base url
     */
    public String getBaseUrl() {
        return baseUrl;
    }

}
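
Usage

A minimal usage sketch follows, assuming a hypothetical MyCrawler subclass of WebCrawler with a public no-argument constructor; the storage path, seed URL, and limits are placeholders. It exercises only the public API shown in the listing above.

import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

/** Hypothetical crawler; a real one would override shouldVisit and visit. */
class MyCrawler extends WebCrawler {
}

public class CrawlerMain {
    public static void main(String[] args) throws Exception {
        // Crawl state (the frontier database) is kept under this folder.
        CrawlController controller = new CrawlController("/tmp/crawl-storage");

        // Wait 200 ms between requests; the controller clamps this to 10 seconds.
        controller.setPolitenessDelay(200);
        controller.setMaximumCrawlDepth(5);
        controller.setMaximumPagesToFetch(1000);

        // Invalid or robots.txt-disallowed seeds are logged and skipped.
        controller.addSeed("http://www.example.com/");

        // Start 4 crawler threads and block until the frontier is exhausted.
        controller.start(MyCrawler.class, 4);
    }
}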