Java tutorial
/* * Copyright (C) 2014 hu * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package com.dict.crawl; import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler; import cn.edu.hfut.dmic.webcollector.model.Links; import cn.edu.hfut.dmic.webcollector.model.Page; import cn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl; import cn.edu.hfut.dmic.webcollector.util.Config; import cn.edu.hfut.dmic.webcollector.util.RegexRule; import com.dict.util.AntiAntiSpiderHelper; import com.dict.util.Configuration; import com.dict.util.JDBCHelper; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonParser; import lombok.extern.apachecommons.CommonsLog; import org.springframework.jdbc.core.JdbcTemplate; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.HttpURLConnection; import java.net.URL; /** * WebCollector 2.xtutorial * 2.x * 1?????????AJAX * 2Berkeley DB?URL??? * 3?selenium?javascript??? * 4???? * 5?spring jdbcmysql connection?? * 6?json? * 7slf4j? * 8http?http * <p/> * ?cn.edu.hfut.dmic.webcollector.example?(Demo) * * @author hu */ @CommonsLog public class NewsNationalGeographicCrawler extends DeepCrawler { RegexRule regexRule = new RegexRule(); JdbcTemplate jdbcTemplate = null; public NewsNationalGeographicCrawler(String crawlPath) { super(crawlPath); regexRule.addRule("http://news.nationalgeographic.com/.*"); regexRule.addRule("http://ngm.nationalgeographic.com/.*"); // regexRule.addRule("http://.*.nationalgeographic.com/.*"); regexRule.addRule("-.*jpg.*"); /*JdbcTemplate,"mysql1"???? JDBCHelper.getJdbcTemplate("mysql1")?? ????URL???????? JdbcTemplate??? JdbcTemplate(?JDBCHelper.getJdbcTemplate("??") ??JdbcTemplate) */ try { Configuration conf = new Configuration("conf/remote.properties"); jdbcTemplate = JDBCHelper.createMysqlTemplate("mysql1", conf.get(Configuration.MYSQL_URL), conf.get(Configuration.MYSQL_USER), conf.get(Configuration.MYSQL_PASSWORD), 5, 30); } catch (Exception ex) { jdbcTemplate = null; System.out.println("mysql?JDBCHelper.createMysqlTemplate???!"); } } @Override public Links visitAndGetNextLinks(Page page) { try { BaseExtractor extractor = new NewsNationalGeographicExtractor(page); if (extractor.extractor() && jdbcTemplate != null) { extractor.insertWith(jdbcTemplate); // // ParserPage p = extractor.getParserPage(); // int updates = jdbcTemplate.update("insert ignore into parser_page (title, type, label, level, style, host, url, time, description, content, wordCount, version, mainimage, moreinfo) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?)", // p.getTitle(), p.getType(), p.getLabel(), p.getLevel(), p.getStyle(), p.getHost(), p.getUrl(), p.getTime(), p.getDescription(), p.getContent(), p.getWordCount(), p.getVersion(), p.getMainimage(), p.getMoreinfo()); //// int updates = jdbcTemplate.update("update parser_page set content = ?, mainimage = ?, style = ? where url = ?", p.getContent(), p.getMainimage(), p.getStyle(), p.getUrl()); // // if (updates == 1) { // System.out.println("parser_page??"); // int id = jdbcTemplate.queryForInt("SELECT id FROM parser_page WHERE url = ?", p.getUrl()); // // updates = jdbcTemplate.update("insert ignore into org_content (id, content) values (?,?)", // id, extractor.doc.html()); // System.out.println("org_content??"); // }else{ // System.out.println("mysql???updates" + updates); // } } } catch (Exception e) { e.printStackTrace(); } // log.info("after insert"); /*?2.0*/ /*?page?? ?URL???URL*/ Links nextLinks = new Links(); /*???URL Links.addAllFromDocument??*/ nextLinks.addAllFromDocument(page.getDoc(), regexRule); /*LinksArrayList<String>,?add?addAllURL ?????return null ???seed?return null */ // log.info("Before return nextLinks"); return nextLinks; } public static void main(String[] args) throws Exception { /*string,crawlPath??crawlPath, ????crawlPath */ NewsNationalGeographicCrawler crawler = new NewsNationalGeographicCrawler("data/NewsNationalGeographic"); crawler.setThreads(2); crawler.addSeed("http://ngm.nationalgeographic.com/"); if (BaseCrawler.isNormalTime()) { crawler.addSeed("http://ngm.nationalgeographic.com/archives"); crawler.addSeed("http://ngm.nationalgeographic.com/featurehub"); // //} String jsonUrl = "http://news.nationalgeographic.com/bin/services/news/public/query/content.json?pageSize=20&page=0&contentTypes=news/components/pagetypes/article,news/components/pagetypes/simple-article,news/components/pagetypes/photo-gallery"; URL urls = new URL(jsonUrl); HttpURLConnection urlConnection = (HttpURLConnection) urls.openConnection(); InputStream is = urlConnection.getInputStream(); Reader rd = new InputStreamReader(is, "utf-8"); JsonArray json = new JsonParser().parse(rd).getAsJsonArray(); for (JsonElement jOb : json) { String url = jOb.getAsJsonObject().get("page").getAsJsonObject().get("url").getAsString(); if (url != null && !url.equals("")) crawler.addSeed(url); } } // crawler.addSeed("http://news.nationalgeographic.com/2016/01/160118-mummies-world-bog-egypt-science/"); // List<Map<String, Object>> urls = crawler.jdbcTemplate.queryForList("SELECT id,title,url FROM parser_page where host like '%news.national%' ORDER by id desc;"); // for(int i = 0; i < urls.size(); i++) { // String url = (String) urls.get(i).get("url"); // String title = (String) urls.get(i).get("title"); //// int id = (int) urls.get(i).get("id"); // crawler.addSeed(url); // } // Config Config.WAIT_THREAD_END_TIME = 1000 * 60 * 5;//???kill // Config.TIMEOUT_CONNECT = 1000*10; // Config.TIMEOUT_READ = 1000*30; Config.requestMaxInterval = 1000 * 60 * 20;//??-?>hung //requester??http??requester?http/socks? HttpRequesterImpl requester = (HttpRequesterImpl) crawler.getHttpRequester(); AntiAntiSpiderHelper.defaultUserAgent(requester); // requester.setUserAgent("Mozilla/5.0 (X11; Linux i686; rv:33.0) Gecko/20100101 Firefox/33.0"); // requester.setCookie("CNZZDATA1950488=cnzz_eid%3D739324831-1432460954-null%26ntime%3D1432460954; wdcid=44349d3f2aa96e51; vjuids=-53d395da8.14eca7eed44.0.f17be67e; CNZZDATA3473518=cnzz_eid%3D1882396923-1437965756-%26ntime%3D1440635510; pt_37a49e8b=uid=FuI4KYEfVz5xq7L4nzPd1w&nid=1&vid=r4AhSBmxisCiyeolr3V2Ow&vn=1&pvn=1&sact=1440639037916&to_flag=0&pl=t4NrgYqSK5M357L2nGEQCw*pt*1440639015734; _ga=GA1.3.1121158748.1437970841; __auc=c00a6ac114d85945f01d9c30128; CNZZDATA1975683=cnzz_eid%3D250014133-1432460541-null%26ntime%3D1440733997; CNZZDATA1254041250=2000695407-1442220871-%7C1442306691; pt_7f0a67e8=uid=6lmgYeZ3/jSObRMeK-t27A&nid=0&vid=lEKvEtZyZdd0UC264UyZnQ&vn=2&pvn=1&sact=1442306703728&to_flag=0&pl=7GB3sYS/PJDo1mY0qeu2cA*pt*1442306703728; 7NSx_98ef_saltkey=P05gN8zn; 7NSx_98ef_lastvisit=1444281282; IframeBodyHeight=256; NTVq_98ef_saltkey=j5PydYru; NTVq_98ef_lastvisit=1444282735; NTVq_98ef_atarget=1; NTVq_98ef_lastact=1444286377%09api.php%09js; 7NSx_98ef_sid=hZyDwc; __utmt=1; __utma=155578217.1121158748.1437970841.1443159326.1444285109.23; __utmb=155578217.57.10.1444285109; __utmc=155578217; __utmz=155578217.1439345650.3.2.utmcsr=travel.chinadaily.com.cn|utmccn=(referral)|utmcmd=referral|utmcct=/; CNZZDATA3089622=cnzz_eid%3D1722311508-1437912344-%26ntime%3D1444286009; wdlast=1444287704; vjlast=1437916393.1444285111.11; 7NSx_98ef_lastact=1444287477%09api.php%09chinadaily; pt_s_3bfec6ad=vt=1444287704638&cad=; pt_3bfec6ad=uid=bo87MAT/HC3hy12HDkBg1A&nid=0&vid=erwHQyFKxvwHXYc4-r6n-w&vn=28&pvn=2&sact=1444287708079&to_flag=0&pl=kkgvLoEHXsCD2gs4VJaWQg*pt*1444287704638; pt_t_3bfec6ad=?id=3bfec6ad.bo87MAT/HC3hy12HDkBg1A.erwHQyFKxvwHXYc4-r6n-w.kkgvLoEHXsCD2gs4VJaWQg.nZJ9Aj/bgfNDIKBXI5TwRQ&stat=167.132.1050.1076.1body%20div%3Aeq%288%29%20ul%3Aeq%280%29%20a%3Aeq%282%29.0.0.1595.3441.146.118&ptif=4"); //?? Mozilla/5.0 (X11; Linux i686; rv:34.0) Gecko/20100101 Firefox/34.0 //c requester.setProxy(" /* //?? RandomProxyGenerator proxyGenerator=new RandomProxyGenerator(); proxyGenerator.addProxy("127.0.0.1",8080,Proxy.Type.SOCKS); requester.setProxyGenerator(proxyGenerator); */ /*??*/ // crawler.setResumable(true); crawler.setResumable(false); crawler.start(2); } }