edu.uci.ics.crawler4j.weatherCrawler.BasicCrawlController.java Source code

Java tutorial

Introduction

Here is the source code for edu.uci.ics.crawler4j.weatherCrawler.BasicCrawlController.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.weatherCrawler;

import java.io.File;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.RandomUtils;

import edu.uci.ics.crawler4j.bean.ProxySetting;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.util.ConfigUtils;
import edu.uci.ics.crawler4j.util.IO;
import edu.uci.ics.crawler4j.util.TestProxy;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class BasicCrawlController {

    public static void main(String[] args) {
        String folder = ConfigUtils.getFolder();
        String crawlerCount = ConfigUtils.getCrawlerCount();
        // Command-line arguments are ignored; the root folder and crawler count
        // come from config.properties, with defaults used when they are missing.
        args = new String[2];
        if (StringUtils.isBlank(folder) || StringUtils.isBlank(crawlerCount)) {
            args[0] = "weather";
            args[1] = "10";
            System.out.println("No folder/crawlerCount settings found in config.properties.");
            System.out.println("[weather] will be used as rootFolder (it will contain intermediate crawl data)");
            System.out.println("[10] will be used as numberOfCrawlers (number of concurrent threads)");
        } else {
            args[0] = folder;
            args[1] = crawlerCount;
        }

        /*
         * crawlStorageFolder is a folder where intermediate crawl data is
         * stored.
         */
        String crawlStorageFolder = args[0];

        /*
         * numberOfCrawlers shows the number of concurrent threads that should
         * be initiated for crawling.
         */
        int numberOfCrawlers = Integer.parseInt(args[1]);

        CrawlConfig config = new CrawlConfig();

        // Clear any intermediate data left over from a previous run, then store
        // this crawl's data in a fresh timestamped subfolder.
        if (crawlStorageFolder != null && IO.deleteFolderContents(new File(crawlStorageFolder)))
            System.out.println("Deleted previous contents of " + crawlStorageFolder);
        config.setCrawlStorageFolder(crawlStorageFolder + "/d" + System.currentTimeMillis());

        /*
         * Be polite: Make sure that we don't send more than 1 request per
         * second (1000 milliseconds between requests).
         */
        config.setPolitenessDelay(1000);

        config.setConnectionTimeout(1000 * 60);

        /*
         * You can set the maximum crawl depth here. The default value is -1 for
         * unlimited depth
         */
        config.setMaxDepthOfCrawling(StringUtils.isBlank(ConfigUtils.getCrawlerDepth()) ? 40
                : Integer.valueOf(ConfigUtils.getCrawlerDepth()));

        /*
         * You can set the maximum number of pages to crawl. The default value
         * is -1 for unlimited number of pages
         */
        config.setMaxPagesToFetch(100000);

        /*
         * Do you need to set a proxy? If so, you can use:
         * config.setProxyHost("proxyserver.example.com");
         * config.setProxyPort(8080);
         * 
         * If your proxy also needs authentication:
         * config.setProxyUsername(username); config.setProxyPassword(password);
         */

        if (ConfigUtils.getValue("useProxy", "false").equalsIgnoreCase("true")) {

            System.out.println("Using a proxy ============");
            List<ProxySetting> proxys = ConfigUtils.getProxyList();

            // Pick a random proxy from the configured list (nextInt(size) can reach
            // every index) and keep picking until an available one is found.
            ProxySetting proxy = proxys.get(RandomUtils.nextInt(proxys.size()));

            /* test whether the proxy is available or not */
            while (!TestProxy.testProxyAvailable(proxy)) {
                proxy = proxys.get(RandomUtils.nextInt(proxys.size()));
            }
            System.out.println("Selected proxy " + proxy.getIp() + ":" + proxy.getPort());
            config.setProxyHost(proxy.getIp());
            config.setProxyPort(proxy.getPort());
            //      config.setProxyHost("127.0.0.1");
            //      config.setProxyPort(8087);
        } else {
            System.out.println("Not using a proxy ============");
        }

        /*
         * This config parameter can be used to make your crawl resumable
         * (meaning that you can resume the crawl from a previously
         * interrupted/crashed crawl). Note: if you enable the resuming feature
         * and want to start a fresh crawl, you need to delete the contents of
         * the rootFolder manually.
         */
        config.setResumableCrawling(false);
        /*
         * Instantiate the controller for this crawl.
         */
        PageFetcher pageFetcher = new PageFetcher(config);

        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        try {

            /*
             * For each crawl, you need to add some seed URLs. These are the
             * first URLs to be fetched; the crawler then follows the links
             * found in those pages.
             */
            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

            controller.addSeed(StringUtils.isBlank(ConfigUtils.getSeed()) ? "http://www.tianqi.com/chinacity.html"
                    : ConfigUtils.getSeed());

            // controller.addSeed("http://www.ics.uci.edu/~lopes/");
            // controller.addSeed("http://www.ics.uci.edu/~welling/");

            /*
             * Start the crawl. This is a blocking operation, meaning that your
             * code will reach the line after this only when crawling is
             * finished.
             */

            // Choose which crawler implementation to run, based on config.properties.
            String isDaily = ConfigUtils.getValue("isDaily", "true");

            System.out.println("Target table: " + ConfigUtils.getValue("table", "weather_data"));

            if (isDaily.equalsIgnoreCase("true")) {
                System.out.println("Starting daily crawler (BasicDailyCrawler) ==============");
                controller.start(BasicDailyCrawler.class, numberOfCrawlers);
            } else {
                System.out.println("Starting basic crawler (BasicCrawler) ==============");
                controller.start(BasicCrawler.class, numberOfCrawlers);
            }

        } catch (Exception e) {
            e.printStackTrace();
        }

    }

}
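
The controller above hands a crawler class (BasicDailyCrawler or BasicCrawler) to controller.start(...), but those classes are not part of this listing. The sketch below shows what a minimal crawler passed to controller.start typically looks like with the crawler4j 3.x callbacks used here (shouldVisit(WebURL) / visit(Page)); the class name, the URL filter, and the restriction to www.tianqi.com are illustrative assumptions, not the project's actual BasicCrawler.

package edu.uci.ics.crawler4j.weatherCrawler;

import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Minimal sketch of a crawler class that could be passed to controller.start(...).
 * The real BasicCrawler/BasicDailyCrawler are not shown on this page.
 */
public class SketchCrawler extends WebCrawler {

    private static final Pattern SKIPPED_RESOURCES = Pattern
            .compile(".*(\\.(css|js|gif|jpe?g|png|zip|pdf))$");

    @Override
    public boolean shouldVisit(WebURL url) {
        // Stay on the seed's site and skip non-HTML resources.
        String href = url.getURL().toLowerCase();
        return !SKIPPED_RESOURCES.matcher(href).matches()
                && href.startsWith("http://www.tianqi.com/");
    }

    @Override
    public void visit(Page page) {
        // Called once per successfully fetched page; data extraction goes here.
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            System.out.println("Visited: " + page.getWebURL().getURL()
                    + " (text length " + htmlParseData.getText().length() + ")");
        }
    }
}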
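
The proxy branch above relies on TestProxy.testProxyAvailable(...), whose source is likewise not shown here. A rough sketch of such a check, written against the pre-4.3 Apache HttpClient API (DefaultHttpClient with ConnRoutePNames), might look like the following; the probe URL, the 5-second timeouts, and the class name are assumptions for illustration only.

package edu.uci.ics.crawler4j.util;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;

import edu.uci.ics.crawler4j.bean.ProxySetting;

/**
 * Sketch of a proxy availability check; the real TestProxy may differ.
 */
public class ProxyCheckSketch {

    public static boolean testProxyAvailable(ProxySetting proxy) {
        HttpClient client = new DefaultHttpClient();
        try {
            // Route every request of this client through the candidate proxy.
            client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,
                    new HttpHost(proxy.getIp(), proxy.getPort()));
            client.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 5000);
            client.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 5000);

            // Consider the proxy usable if a simple GET through it returns HTTP 200.
            HttpResponse response = client.execute(new HttpGet("http://www.tianqi.com/"));
            return response.getStatusLine().getStatusCode() == 200;
        } catch (Exception e) {
            return false;
        } finally {
            client.getConnectionManager().shutdown();
        }
    }
}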