com.dict.crawl.NewsNationalGeographicCrawler.java Source code

Java tutorial

Introduction

Here is the source code for com.dict.crawl.NewsNationalGeographicCrawler.java, a WebCollector-based crawler that collects article pages from news.nationalgeographic.com and ngm.nationalgeographic.com and persists the extracted results to MySQL.
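The class follows WebCollector 2.x's DeepCrawler pattern: subclass DeepCrawler, override visitAndGetNextLinks(Page) to process each fetched page, and return the links to crawl next. Below is a minimal sketch of that pattern, using only API calls that appear in the listing; the class name, crawlPath, and seed URL are illustrative:

import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;

public class MinimalCrawler extends DeepCrawler {

    public MinimalCrawler(String crawlPath) {
        super(crawlPath); // crawl state is persisted under crawlPath
    }

    @Override
    public Links visitAndGetNextLinks(Page page) {
        // process the fetched page here, then return the next URLs;
        // returning null stops the crawl from expanding beyond the seeds
        Links next = new Links();
        next.add("http://news.nationalgeographic.com/");
        return next;
    }

    public static void main(String[] args) throws Exception {
        MinimalCrawler crawler = new MinimalCrawler("data/minimal");
        crawler.setThreads(2);
        crawler.addSeed("http://news.nationalgeographic.com/");
        crawler.start(2); // crawl to depth 2
    }
}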

Source

/*
 * Copyright (C) 2014 hu
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.dict.crawl;

import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl;
import cn.edu.hfut.dmic.webcollector.util.Config;
import cn.edu.hfut.dmic.webcollector.util.RegexRule;
import com.dict.util.AntiAntiSpiderHelper;
import com.dict.util.Configuration;
import com.dict.util.JDBCHelper;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import lombok.extern.apachecommons.CommonsLog;
import org.springframework.jdbc.core.JdbcTemplate;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * WebCollector 2.x tutorial.
 * New in 2.x:
 * 1) custom traversal strategies, supporting more complex crawls such as pagination and AJAX pages
 * 2) built-in Berkeley DB management of crawled URLs
 * 3) selenium integration for extracting javascript-generated content
 * 4) random switching between multiple proxies
 * 5) spring jdbc and mysql connection integration for persistence
 * 6) built-in json parsing
 * 7) slf4j logging facade
 * 8) reworked http request interface for easier custom http requests
 * <p/>
 * See the cn.edu.hfut.dmic.webcollector.example package for demo crawlers
 *
 * @author hu
 */
@CommonsLog
public class NewsNationalGeographicCrawler extends DeepCrawler {

    RegexRule regexRule = new RegexRule();

    JdbcTemplate jdbcTemplate = null;

    public NewsNationalGeographicCrawler(String crawlPath) {
        super(crawlPath);

        regexRule.addRule("http://news.nationalgeographic.com/.*");
        regexRule.addRule("http://ngm.nationalgeographic.com/.*");
        //        regexRule.addRule("http://.*.nationalgeographic.com/.*");
        regexRule.addRule("-.*jpg.*");

        /* Create a JdbcTemplate registered under the name "mysql1".
           Once created, it can be retrieved anywhere with
           JDBCHelper.getJdbcTemplate("mysql1") instead of being recreated;
           it is used below to persist extracted pages.
         */
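        /* Illustrative sketch (not part of the original flow): once created,
           the named template could be fetched and reused elsewhere, e.g.

           JdbcTemplate t = JDBCHelper.getJdbcTemplate("mysql1");
           t.update("insert ignore into parser_page (url) values (?)", someUrl);

           ("someUrl" and the single-column insert are hypothetical; the real
           insert statement appears commented out in visitAndGetNextLinks below.)
         */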

        try {

            Configuration conf = new Configuration("conf/remote.properties");
            jdbcTemplate = JDBCHelper.createMysqlTemplate("mysql1", conf.get(Configuration.MYSQL_URL),
                    conf.get(Configuration.MYSQL_USER), conf.get(Configuration.MYSQL_PASSWORD), 5, 30);
        } catch (Exception ex) {
            jdbcTemplate = null;
            System.out.println("mysql?JDBCHelper.createMysqlTemplate???!");
        }
    }

    @Override
    public Links visitAndGetNextLinks(Page page) {
        try {
            BaseExtractor extractor = new NewsNationalGeographicExtractor(page);
            if (extractor.extractor() && jdbcTemplate != null) {
                extractor.insertWith(jdbcTemplate);
                //
                //                ParserPage p = extractor.getParserPage();
                //                int updates = jdbcTemplate.update("insert ignore into parser_page (title, type, label, level, style, host, url, time, description, content, wordCount, version, mainimage, moreinfo) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                //                        p.getTitle(), p.getType(), p.getLabel(), p.getLevel(), p.getStyle(), p.getHost(), p.getUrl(), p.getTime(), p.getDescription(), p.getContent(), p.getWordCount(), p.getVersion(), p.getMainimage(), p.getMoreinfo());
                ////                int updates = jdbcTemplate.update("update parser_page set content = ?, mainimage = ?, style = ? where url = ?", p.getContent(), p.getMainimage(), p.getStyle(), p.getUrl());
                //
                //                if (updates == 1) {
                //                    System.out.println("parser_page insert succeeded");
                //                    int id = jdbcTemplate.queryForInt("SELECT id FROM parser_page WHERE url = ?", p.getUrl());
                //
                //                    updates = jdbcTemplate.update("insert ignore into org_content (id, content) values (?,?)",
                //                            id, extractor.doc.html());
                //                    System.out.println("org_content insert succeeded");
                //                }else{
                //                    System.out.println("mysql insert failed, updates=" + updates);
                //                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        //        log.info("after insert");
        /* New in version 2.0 */
        /* Extract the links from this page and return them as the next
           URLs to crawl; duplicate URLs are filtered out automatically. */
        Links nextLinks = new Links();

        /* Only URLs matching the regex rules should be followed;
           Links.addAllFromDocument provides exactly that. */
        nextLinks.addAllFromDocument(page.getDoc(), regexRule);

        /* Links extends ArrayList<String>, so URLs can also be added with add/addAll.
           If no links should be extracted from this page, return null;
           returning null for every page restricts the crawl to the seeds.
         */
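        /* Illustrative alternatives (not in the original): a specific URL can be
           added by hand, e.g. nextLinks.add("http://ngm.nationalgeographic.com/archives");
           or return null instead of nextLinks to stop expanding from this page. */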
        //        log.info("Before return nextLinks");
        return nextLinks;
    }

    public static void main(String[] args) throws Exception {
        /* The String passed to the constructor is the crawler's crawlPath;
           all crawl state is stored under that folder, so different
           crawlers must use different crawlPaths. */

        NewsNationalGeographicCrawler crawler = new NewsNationalGeographicCrawler("data/NewsNationalGeographic");
        crawler.setThreads(2);
        crawler.addSeed("http://ngm.nationalgeographic.com/");

        if (BaseCrawler.isNormalTime()) {

            crawler.addSeed("http://ngm.nationalgeographic.com/archives");
            crawler.addSeed("http://ngm.nationalgeographic.com/featurehub");

            String jsonUrl = "http://news.nationalgeographic.com/bin/services/news/public/query/content.json?pageSize=20&page=0&contentTypes=news/components/pagetypes/article,news/components/pagetypes/simple-article,news/components/pagetypes/photo-gallery";
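            /* The query above is expected to return a JSON array of objects shaped
               roughly like [ { "page": { "url": "http://news.nationalgeographic.com/...", ... }, ... }, ... ]
               (inferred from the parsing code below; the live API may differ). */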

            URL queryUrl = new URL(jsonUrl);
            HttpURLConnection urlConnection = (HttpURLConnection) queryUrl.openConnection();
            try (Reader rd = new InputStreamReader(urlConnection.getInputStream(), "utf-8")) {
                JsonArray json = new JsonParser().parse(rd).getAsJsonArray();
                for (JsonElement jOb : json) {
                    String url = jOb.getAsJsonObject().get("page").getAsJsonObject().get("url").getAsString();
                    if (url != null && !url.isEmpty()) {
                        crawler.addSeed(url);
                    }
                }
            } finally {
                urlConnection.disconnect();
            }

        }

        //        crawler.addSeed("http://news.nationalgeographic.com/2016/01/160118-mummies-world-bog-egypt-science/");
        //        List<Map<String, Object>> urls = crawler.jdbcTemplate.queryForList("SELECT id,title,url FROM parser_page where host like '%news.national%' ORDER by id desc;");
        //        for(int i = 0; i < urls.size(); i++) {
        //            String url = (String) urls.get(i).get("url");
        //            String title = (String) urls.get(i).get("title");
        ////            int id = (int) urls.get(i).get("id");
        //            crawler.addSeed(url);
        //        }

        // Config tuning
        Config.WAIT_THREAD_END_TIME = 1000 * 60 * 5; // give worker threads up to 5 minutes to finish before they are killed
        //        Config.TIMEOUT_CONNECT = 1000*10;
        //        Config.TIMEOUT_READ = 1000*30;
        Config.requestMaxInterval = 1000 * 60 * 20; // if no request completes within 20 minutes, the crawl is treated as hung

        // requester is the plugin that sends the http requests; it can also be configured with an http or socks proxy
        HttpRequesterImpl requester = (HttpRequesterImpl) crawler.getHttpRequester();
        AntiAntiSpiderHelper.defaultUserAgent(requester);

        //        requester.setUserAgent("Mozilla/5.0 (X11; Linux i686; rv:33.0) Gecko/20100101 Firefox/33.0");
        //        requester.setCookie("CNZZDATA1950488=cnzz_eid%3D739324831-1432460954-null%26ntime%3D1432460954; wdcid=44349d3f2aa96e51; vjuids=-53d395da8.14eca7eed44.0.f17be67e; CNZZDATA3473518=cnzz_eid%3D1882396923-1437965756-%26ntime%3D1440635510; pt_37a49e8b=uid=FuI4KYEfVz5xq7L4nzPd1w&nid=1&vid=r4AhSBmxisCiyeolr3V2Ow&vn=1&pvn=1&sact=1440639037916&to_flag=0&pl=t4NrgYqSK5M357L2nGEQCw*pt*1440639015734; _ga=GA1.3.1121158748.1437970841; __auc=c00a6ac114d85945f01d9c30128; CNZZDATA1975683=cnzz_eid%3D250014133-1432460541-null%26ntime%3D1440733997; CNZZDATA1254041250=2000695407-1442220871-%7C1442306691; pt_7f0a67e8=uid=6lmgYeZ3/jSObRMeK-t27A&nid=0&vid=lEKvEtZyZdd0UC264UyZnQ&vn=2&pvn=1&sact=1442306703728&to_flag=0&pl=7GB3sYS/PJDo1mY0qeu2cA*pt*1442306703728; 7NSx_98ef_saltkey=P05gN8zn; 7NSx_98ef_lastvisit=1444281282; IframeBodyHeight=256; NTVq_98ef_saltkey=j5PydYru; NTVq_98ef_lastvisit=1444282735; NTVq_98ef_atarget=1; NTVq_98ef_lastact=1444286377%09api.php%09js; 7NSx_98ef_sid=hZyDwc; __utmt=1; __utma=155578217.1121158748.1437970841.1443159326.1444285109.23; __utmb=155578217.57.10.1444285109; __utmc=155578217; __utmz=155578217.1439345650.3.2.utmcsr=travel.chinadaily.com.cn|utmccn=(referral)|utmcmd=referral|utmcct=/; CNZZDATA3089622=cnzz_eid%3D1722311508-1437912344-%26ntime%3D1444286009; wdlast=1444287704; vjlast=1437916393.1444285111.11; 7NSx_98ef_lastact=1444287477%09api.php%09chinadaily; pt_s_3bfec6ad=vt=1444287704638&cad=; pt_3bfec6ad=uid=bo87MAT/HC3hy12HDkBg1A&nid=0&vid=erwHQyFKxvwHXYc4-r6n-w&vn=28&pvn=2&sact=1444287708079&to_flag=0&pl=kkgvLoEHXsCD2gs4VJaWQg*pt*1444287704638; pt_t_3bfec6ad=?id=3bfec6ad.bo87MAT/HC3hy12HDkBg1A.erwHQyFKxvwHXYc4-r6n-w.kkgvLoEHXsCD2gs4VJaWQg.nZJ9Aj/bgfNDIKBXI5TwRQ&stat=167.132.1050.1076.1body%20div%3Aeq%288%29%20ul%3Aeq%280%29%20a%3Aeq%282%29.0.0.1595.3441.146.118&ptif=4");
        // example User-Agent: Mozilla/5.0 (X11; Linux i686; rv:34.0) Gecko/20100101 Firefox/34.0
        // single fixed proxy: requester.setProxy(...)
        /*
        // random switching between multiple proxies
        RandomProxyGenerator proxyGenerator=new RandomProxyGenerator();
        proxyGenerator.addProxy("127.0.0.1",8080,Proxy.Type.SOCKS);
        requester.setProxyGenerator(proxyGenerator);
        */

        /* whether to resume from the crawl state stored in crawlPath */
        //        crawler.setResumable(true);
        crawler.setResumable(false);

        crawler.start(2); // crawl to depth 2
    }

}
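Note: as inferred from the imports in the listing, building this class requires WebCollector 2.x, Gson, Lombok, and Spring JDBC on the classpath, plus the project's own helpers referenced above (com.dict.util.Configuration, JDBCHelper, AntiAntiSpiderHelper, and the BaseCrawler/BaseExtractor/NewsNationalGeographicExtractor classes).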