Java tutorial: the crawler4j CrawlController class (CrawlController.java)
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.crawler;

import java.io.File;
import java.io.IOException;
import java.lang.Thread.State;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;

import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.IO;

/**
 * The CrawlController manages a crawl session: it opens the Berkeley DB
 * environment used by the frontier and docid server, schedules seed URLs,
 * starts the crawler threads and monitors them until the crawl finishes.
 *
 * @author Yasser Ganjisaffar <yganjisa at uci dot edu>
 */
public final class CrawlController {

    /** The logger. */
    private static final Logger logger = Logger.getLogger(CrawlController.class.getName());

    /** The Berkeley DB environment backing the frontier and docid server. */
    private Environment env;

    /** The per-crawler local data collected when the crawl finishes. */
    private List<Object> crawlersLocalData = new ArrayList<Object>();

    /** The base url to start crawling from. */
    private String baseUrl;

    /** Ensures the frontier folder is only deleted once per JVM. */
    private static boolean frontierDirDeleted = false;

    /** Monitoring interval, in seconds, between thread status checks. */
    private static final int DELAY = 20;

    /**
     * Gets the crawlers' local data.
     *
     * @return the crawlers' local data
     */
    public List<Object> getCrawlersLocalData() {
        return crawlersLocalData;
    }

    /** The crawler threads. */
    List<Thread> threads;

    /**
     * Instantiates a new crawl controller, reading the resumable flag from the
     * crawler.enable_resume configuration property.
     *
     * @param storageFolder
     *            the storage folder
     * @throws Exception
     *             the exception
     */
    public CrawlController(String storageFolder) throws Exception {
        this(storageFolder, Configurations.getBooleanProperty("crawler.enable_resume", true));
    }

    /**
     * Instantiates a new crawl controller.
     *
     * @param storageFolder
     *            the storage folder
     * @param resumable
     *            whether the crawl should be resumable
     * @throws Exception
     *             the exception
     */
    public CrawlController(String storageFolder, boolean resumable) throws Exception {
        deleteFrontierDb(storageFolder);
        File folder = new File(storageFolder);
        if (!folder.exists()) {
            folder.mkdirs();
        }

        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);
        envConfig.setTransactional(resumable);
        envConfig.setLocking(resumable);

        File envHome = new File(storageFolder + "/frontier");
        if (!envHome.exists()) {
            envHome.mkdir();
        }
        /*
         * if (!resumable) { IO.deleteFolderContents(envHome); }
         */
        env = new Environment(envHome, envConfig);
        Frontier.init(env, resumable);
        DocIDServer.init(env, resumable);
        PageFetcher.startConnectionMonitorThread();
    }

    /** Deletes the on-disk frontier folder, at most once per JVM, before the environment is opened. */
    private synchronized void deleteFrontierDb(String storageFolder) throws IOException {
        if (!frontierDirDeleted) {
            File frontierDb = new File(storageFolder + "/frontier");
            logger.info("Deleting: " + frontierDb.getAbsolutePath());
            FileUtils.deleteDirectory(frontierDb);
            frontierDirDeleted = true;
        }
    }

    /**
     * Starts the crawl with the given number of crawler threads and blocks
     * until all of them have finished.
     *
     * @param <T>
     *            the crawler type
     * @param _c
     *            the crawler class to instantiate
     * @param numberOfCrawlers
     *            the number of concurrent crawler threads
     */
    public <T extends WebCrawler> void start(Class<T> _c, int numberOfCrawlers) {
        try {
            crawlersLocalData.clear();
            threads = new ArrayList<Thread>();
            List<T> crawlers = new ArrayList<T>();
            for (int i = 1; i <= numberOfCrawlers; i++) {
                T crawler = _c.newInstance();
                Thread thread = new Thread(crawler, "Crawler " + i);
                logger.info("Thread state before start: " + thread.getState().toString());
                crawler.setThread(thread);
                crawler.setMyId(i);
                crawler.setMyController(this);
                thread.start();
                logger.info("Thread state after start: " + thread.getState().toString());
                crawlers.add(crawler);
                threads.add(thread);
                logger.info("Crawler " + i + " started.");
            }
            while (true) {
                sleep(DELAY);
                boolean someoneIsWorking = false;
                for (int i = 0; i < threads.size(); i++) {
                    Thread thread = threads.get(i);
                    if (!thread.isAlive()) {
                        recreateThread(_c, crawlers, i);
                    } else if (thread.getState() == State.RUNNABLE) {
                        someoneIsWorking = true;
                        logger.info("Thread " + i + " was RUNNABLE.");
                    } else if (thread.getState() == State.WAITING) {
                        logger.info("Thread " + i + " was WAITING.");
                        // thread.interrupt();
                        // thread.join();
                    } else {
                        logger.info("Thread " + i + " was " + thread.getState().toString());
                        // recreateThread(_c, crawlers, i);
                    }
                }
                if (!someoneIsWorking) {
                    // Make sure again that none of the threads are alive.
                    logger.info("It looks like no thread is working, waiting for " + DELAY + " seconds to make sure...");
                    sleep(DELAY);
                    if (!isAnyThreadWorking()) {
                        long queueLength = Frontier.getQueueLength();
                        if (queueLength > 0) {
                            continue;
                        }
                        logger.info("No thread is working and no more URLs are in the queue, waiting for another " + DELAY
                                + " seconds to make sure...");
                        sleep(DELAY);
                        queueLength = Frontier.getQueueLength();
                        if (queueLength > 0) {
                            continue;
                        }
                        logger.info("All of the crawlers are stopped. Finishing the process...");
                        for (T crawler : crawlers) {
                            crawler.onBeforeExit();
                            crawlersLocalData.add(crawler.getMyLocalData());
                        }
                        // At this step, the frontier notifies the threads that were waiting for
                        // new URLs that they should stop. We wait a few seconds for them and then return.
                        Frontier.finish();
                        logger.info("Waiting for " + DELAY + " seconds before final clean up...");
                        sleep(DELAY);
                        try {
                            Frontier.close();
                            env.close();
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                        /*
                         * for (int i = 0; i < threads.size(); i++) {
                         *     Thread thread = threads.get(i);
                         *     logger.info("Thread state = " + thread.getState().toString());
                         *     if (thread.isAlive()) {
                         *         logger.info("Wait for live thread to die");
                         *         thread.join();
                         *     }
                         * }
                         */
                        // PageFetcher.stopConnectionMonitorThread();
                        return;
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Recreates a dead crawler thread in place.
     *
     * @param <T>
     *            the crawler type
     * @param _c
     *            the crawler class to instantiate
     * @param crawlers
     *            the list of crawler instances
     * @param i
     *            the index of the dead thread
     * @throws InstantiationException
     *             the instantiation exception
     * @throws IllegalAccessException
     *             the illegal access exception
     */
    private <T extends WebCrawler> void recreateThread(Class<T> _c, List<T> crawlers, int i)
            throws InstantiationException, IllegalAccessException {
        logger.info("Thread " + i + " was dead, recreating it.");
        T crawler = _c.newInstance();
        Thread thread = new Thread(crawler, "Crawler " + (i + 1));
        logger.info("Thread state before start: " + thread.getState().toString());
        threads.remove(i);
        threads.add(i, thread);
        crawler.setThread(thread);
        crawler.setMyId(i + 1);
        crawler.setMyController(this);
        thread.start();
        logger.info("Thread state after start: " + thread.getState().toString());
        crawlers.remove(i);
        crawlers.add(i, crawler);
    }

    /**
     * Sleeps for the given number of seconds.
     *
     * @param seconds
     *            the seconds to sleep
     */
    private void sleep(int seconds) {
        try {
            Thread.sleep(seconds * 1000);
        } catch (InterruptedException e) {
            // We've been interrupted: no more messages.
            return;
        }
    }

    /**
     * Checks if any thread is working.
     *
     * @return true, if any thread is alive and RUNNABLE
     */
    private boolean isAnyThreadWorking() {
        boolean someoneIsWorking = false;
        for (int i = 0; i < threads.size(); i++) {
            Thread thread = threads.get(i);
            if (thread.isAlive() && thread.getState() == State.RUNNABLE) {
                someoneIsWorking = true;
            }
        }
        return someoneIsWorking;
    }

    /**
     * Adds a seed URL. The URL is canonicalized, assigned a document id and,
     * if allowed by robots.txt, scheduled in the frontier.
     *
     * @param pageUrl
     *            the page url
     */
    public void addSeed(String pageUrl) {
        String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
        if (canonicalUrl == null) {
            logger.error("Invalid seed URL: " + pageUrl);
            return;
        }
        int docid = DocIDServer.getDocID(canonicalUrl);
        if (docid > 0) {
            // This URL has already been seen.
            return;
        }
        WebURL webUrl = new WebURL();
        webUrl.setURL(canonicalUrl);
        docid = DocIDServer.getNewDocID(canonicalUrl);
        webUrl.setDocid(docid);
        webUrl.setDepth((short) 0);
        if (!RobotstxtServer.allows(webUrl)) {
            logger.info("Robots.txt does not allow this seed: " + pageUrl);
        } else {
            Frontier.schedule(webUrl);
        }
        saveBaseUrl(pageUrl);
    }

    /**
     * Derives and saves the base url from the given seed URL.
     *
     * @param pageUrl
     *            the page url
     */
    private void saveBaseUrl(String pageUrl) {
        int lastPos = pageUrl.indexOf("//") + 2;
        lastPos = pageUrl.indexOf("/", lastPos);
        if (lastPos == -1) {
            baseUrl = pageUrl;
        } else {
            baseUrl = pageUrl.substring(0, lastPos);
            int pos = baseUrl.indexOf(".");
            if (pos > 0) {
                baseUrl = baseUrl.substring(pos + 1, baseUrl.length());
            }
        }
        logger.info("baseUrl = " + baseUrl);
    }

    /**
     * Sets the politeness delay applied by the page fetcher, capped at 10,000 milliseconds.
     *
     * @param milliseconds
     *            the new politeness delay
     */
    public void setPolitenessDelay(int milliseconds) {
        if (milliseconds < 0) {
            return;
        }
        if (milliseconds > 10000) {
            milliseconds = 10000;
        }
        PageFetcher.setPolitenessDelay(milliseconds);
    }

    /**
     * Sets the maximum crawl depth.
     *
     * @param depth
     *            the new maximum crawl depth, or -1 for unlimited depth
     * @throws Exception
     *             the exception
     */
    public void setMaximumCrawlDepth(int depth) throws Exception {
        if (depth < -1) {
            throw new Exception("Maximum crawl depth should be either a positive number or -1 for unlimited depth.");
        }
        if (depth > Short.MAX_VALUE) {
            throw new Exception("Maximum value for crawl depth is " + Short.MAX_VALUE);
        }
        WebCrawler.setMaximumCrawlDepth((short) depth);
    }

    /**
     * Sets the maximum number of pages to fetch.
     *
     * @param max
     *            the new maximum pages to fetch
     */
    public void setMaximumPagesToFetch(int max) {
        Frontier.setMaximumPagesToFetch(max);
    }

    /**
     * Sets the proxy.
     *
     * @param proxyHost
     *            the proxy host
     * @param proxyPort
     *            the proxy port
     */
    public void setProxy(String proxyHost, int proxyPort) {
        PageFetcher.setProxy(proxyHost, proxyPort);
    }

    /**
     * Sets the proxy with authentication.
     *
     * @param proxyHost
     *            the proxy host
     * @param proxyPort
     *            the proxy port
     * @param username
     *            the username
     * @param password
     *            the password
     */
    public static void setProxy(String proxyHost, int proxyPort, String username, String password) {
        PageFetcher.setProxy(proxyHost, proxyPort, username, password);
    }

    /**
     * Gets the base url.
     *
     * @return the base url
     */
    public String getBaseUrl() {
        return baseUrl;
    }

}
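
For context, the controller above is normally driven from a small launcher class. The following is a minimal, hypothetical usage sketch based only on the public methods shown above; the storage folder, seed URL, thread count, and the MyCrawler class (a WebCrawler subclass that is not part of this file) are illustrative assumptions, not part of the original source.

    import edu.uci.ics.crawler4j.crawler.CrawlController;

    public class Controller {

        public static void main(String[] args) throws Exception {
            // Storage folder for the frontier and docid databases (example path).
            CrawlController controller = new CrawlController("/data/crawl/root");

            // Politeness delay in milliseconds; the setter above caps it at 10,000 ms.
            controller.setPolitenessDelay(200);

            // Limit link depth; -1 would mean unlimited depth.
            controller.setMaximumCrawlDepth(5);

            // Seed URL to start from (example URL); addSeed() canonicalizes it,
            // checks robots.txt and schedules it in the frontier.
            controller.addSeed("http://example.com/");

            // MyCrawler is an assumed WebCrawler subclass (not shown in this file).
            // start() blocks until all crawler threads have finished.
            controller.start(MyCrawler.class, 10);

            // Per-crawler data gathered via getMyLocalData() before the threads exited.
            System.out.println("Collected " + controller.getCrawlersLocalData().size() + " local data objects.");
        }
    }

Because Frontier, DocIDServer and PageFetcher are wired up through static calls, this version of the controller appears to assume a single active crawl per JVM.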