Java tutorial: launching a crawler4j weather crawler with BasicCrawlController
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.uci.ics.crawler4j.weatherCrawler;

import java.io.File;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.RandomUtils;

import edu.uci.ics.crawler4j.bean.ProxySetting;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.util.ConfigUtils;
import edu.uci.ics.crawler4j.util.IO;
import edu.uci.ics.crawler4j.util.TestProxy;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class BasicCrawlController {

    public static void main(String[] args) {
        String folder = ConfigUtils.getFolder();
        String crawlerCount = ConfigUtils.getCrawlerCount();
        args = new String[2];
        if (StringUtils.isBlank(folder) || StringUtils.isBlank(crawlerCount)) {
            args[0] = "weather";
            args[1] = "10";
            System.out.println("No parameters in config.properties .......");
            System.out.println("[weather] will be used as rootFolder (it will contain intermediate crawl data)");
            System.out.println("[10] will be used as numberOfCrawlers (number of concurrent threads)");
        } else {
            args[0] = folder;
            args[1] = crawlerCount;
        }

        /*
         * crawlStorageFolder is a folder where intermediate crawl data is
         * stored.
         */
        String crawlStorageFolder = args[0];

        /*
         * numberOfCrawlers shows the number of concurrent threads that should
         * be initiated for crawling.
         */
        int numberOfCrawlers = Integer.parseInt(args[1]);

        CrawlConfig config = new CrawlConfig();

        // Clear out any crawl data left over from a previous run.
        if (crawlStorageFolder != null) {
            IO.deleteFolderContents(new File(crawlStorageFolder));
        }

        config.setCrawlStorageFolder(crawlStorageFolder + "/d" + System.currentTimeMillis());

        /*
         * Be polite: make sure that we don't send more than 1 request per
         * second (1000 milliseconds between requests).
         */
        config.setPolitenessDelay(1000);
        config.setConnectionTimeout(1000 * 60);

        /*
         * You can set the maximum crawl depth here. The default value is -1
         * for unlimited depth.
         */
        config.setMaxDepthOfCrawling(StringUtils.isBlank(ConfigUtils.getCrawlerDepth())
                ? 40 : Integer.valueOf(ConfigUtils.getCrawlerDepth()));
        /*
         * You can set the maximum number of pages to crawl. The default value
         * is -1 for an unlimited number of pages.
         */
        config.setMaxPagesToFetch(100000);

        /*
         * Do you need to set a proxy? If so, you can use:
         * config.setProxyHost("proxyserver.example.com");
         * config.setProxyPort(8080);
         *
         * If your proxy also needs authentication:
         * config.setProxyUsername(username);
         * config.setProxyPassword(password);
         */
        if (ConfigUtils.getValue("useProxy", "false").equalsIgnoreCase("true")) {
            System.out.println("Using a proxy ============");
            List<ProxySetting> proxys = ConfigUtils.getProxyList();
            ProxySetting proxy = proxys.get(RandomUtils.nextInt(proxys.size()));

            /* Test whether the proxy is available; if not, pick another one. */
            while (!TestProxy.testProxyAvailable(proxy)) {
                proxy = proxys.get(RandomUtils.nextInt(proxys.size()));
            }
            System.out.println("Proxy: " + proxy.getIp() + ":" + proxy.getPort());
            config.setProxyHost(proxy.getIp());
            config.setProxyPort(proxy.getPort());
        } else {
            System.out.println("Not using a proxy ============");
        }

        /*
         * This config parameter can be used to make your crawl resumable
         * (meaning that you can resume the crawl from a previously
         * interrupted/crashed crawl). Note: if you enable resumable crawling
         * and want to start a fresh crawl, you need to delete the contents of
         * rootFolder manually.
         */
        config.setResumableCrawling(false);

        /*
         * Instantiate the controller for this crawl.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        try {
            /*
             * For each crawl, you need to add some seed URLs. These are the
             * first URLs that are fetched; the crawler then follows the links
             * found in those pages.
             */
            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
            controller.addSeed(StringUtils.isBlank(ConfigUtils.getSeed())
                    ? "http://www.tianqi.com/chinacity.html" : ConfigUtils.getSeed());

            /*
             * Start the crawl. This is a blocking operation, meaning that your
             * code will reach the line after this only when crawling is
             * finished.
             */
            String isDaily = ConfigUtils.getValue("isDaily", "true");
            System.out.println("Target table ======= "
                    + ConfigUtils.getValue("table", "weather_data") + " =======");
            if (isDaily.equalsIgnoreCase("true")) {
                System.out.println("Starting daily crawler ==============");
                controller.start(BasicDailyCrawler.class, numberOfCrawlers);
            } else {
                System.out.println("Starting basic crawler ==============");
                controller.start(BasicCrawler.class, numberOfCrawlers);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
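The controller pulls all of its settings from a config.properties file through ConfigUtils. That file is not shown in the source, so the following is only a plausible sketch: the useProxy, isDaily, and table keys appear literally in the getValue calls above, while folder, crawlerCount, crawlerDepth, and seed are assumed names for whatever keys ConfigUtils.getFolder(), getCrawlerCount(), getCrawlerDepth(), and getSeed() actually read.

# Root folder for intermediate crawl data (becomes args[0]); assumed key name
folder=weather
# Number of concurrent crawler threads (becomes args[1]); assumed key name
crawlerCount=10
# Maximum crawl depth; the code falls back to 40 when this is blank; assumed key name
crawlerDepth=40
# Route requests through a proxy picked from the proxy list (key used verbatim in the code)
useProxy=false
# Seed URL; the code falls back to http://www.tianqi.com/chinacity.html when blank; assumed key name
seed=http://www.tianqi.com/chinacity.html
# true starts BasicDailyCrawler, anything else starts BasicCrawler (key used verbatim in the code)
isDaily=true
# Table that crawled weather data is written to (key used verbatim in the code)
table=weather_data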
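controller.start() is handed BasicCrawler or BasicDailyCrawler, neither of which is shown here. For orientation, a minimal crawler against the crawler4j 3.x WebCrawler API (shouldVisit/visit, matching the controller code above) might look like the class below. The filter pattern and the body of visit are illustrative assumptions, not the project's actual implementation, which presumably parses weather data out of each page and stores it in the configured table.

package edu.uci.ics.crawler4j.weatherCrawler;

import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class BasicCrawler extends WebCrawler {

    // Skip common static resources; only page URLs are worth parsing.
    private static final Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|ico|pdf|zip|gz))$");

    @Override
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        // Stay on the seed site and ignore binary/static files.
        return !FILTERS.matcher(href).matches()
                && href.startsWith("http://www.tianqi.com/");
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            // A real implementation would extract the weather fields from the
            // HTML here and write one row per city to the configured table.
            System.out.println("Visited: " + url + " (" + text.length() + " chars of text)");
        }
    }
}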