Java tutorial
/* * Copyright (C) 2015 hu * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 109 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package cn.weibo.webcollector.spider; import cn.edu.hfut.dmic.webcollector.example.WeiboCN; import cn.edu.hfut.dmic.webcollector.model.CrawlDatum; import cn.edu.hfut.dmic.webcollector.model.CrawlDatums; import cn.edu.hfut.dmic.webcollector.model.Page; import cn.edu.hfut.dmic.webcollector.net.HttpRequest; import cn.edu.hfut.dmic.webcollector.net.HttpResponse; import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler; import cn.edu.hfut.dmic.webcollector.plugin.mongo.MongoCrawler; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.mongodb.client.MongoDatabase; import com.mongodb.DB; import com.mongodb.MongoClient; import com.mongodb.client.MongoCollection; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.bson.Document; import java.util.ArrayList; import java.util.List; import java.util.Set; /** * WebCollector?cookie???? 
* <p>Breadth-first crawler for weibo.cn pages. Every request carries a cookie obtained
 * through {@code WeiboCN.getSinaCookie}, and the text of each non-empty
 * {@code div[id].c} element of a visited page is stored as one document in the
 * {@code weibo_page} collection of the {@code weibo_crawler} MongoDB database.
 *
 * @author hu
 */
public class WeiboCrawler extends BreadthCrawler {

    /** Host of the MongoDB server used for storage. */
    private static final String MONGO_HOST = "localhost";
    /** Port of the MongoDB server used for storage. */
    private static final int MONGO_PORT = 27017;
    /** Database that receives the crawled posts. */
    private static final String DATABASE_NAME = "weibo_crawler";
    /** Collection that receives the crawled posts. */
    private static final String COLLECTION_NAME = "weibo_page";

    /*
     * NOTE(review): these credentials are committed to source control. They are kept only so
     * that the two-argument constructor behaves as before; prefer the four-argument
     * constructor and consider rotating and removing these values.
     */
    private static final String DEFAULT_USERNAME = "eyuhn2000@163.com";
    private static final String DEFAULT_PASSWORD = "xxnda2011";

    /** weibo.cn account paths whose timelines are seeded, in seeding order. */
    private static final String[] SEED_ACCOUNTS = {
        "entpaparazzi",
        "dianyingpiaofangba",
        "houson100037",
        "kaopuyingping",
        "rottentomato",
        "cfcu",
        "moviefactory",
        "wodianying",
        "movietheworld",
        "badmovie"
    };

    /** Number of timeline pages seeded per account (pages 1..N). */
    private static final int PAGES_PER_ACCOUNT = 10;

    static {
        // Set once instead of on every page visit: only report driver errors.
        Logger.getLogger("org.mongodb.driver").setLevel(Level.ERROR);
    }

    /** Cookie sent with every request issued by this crawler. */
    String cookie;

    /**
     * Creates a crawler that logs in with the built-in default account.
     *
     * @param crawlPath crawl path handed to {@code BreadthCrawler}
     * @param autoParse auto-parse flag handed to {@code BreadthCrawler}
     * @throws Exception if the cookie cannot be obtained
     */
    public WeiboCrawler(String crawlPath, boolean autoParse) throws Exception {
        this(crawlPath, autoParse, DEFAULT_USERNAME, DEFAULT_PASSWORD);
    }

    /**
     * Creates a crawler that logs in with the given account.
     *
     * @param crawlPath crawl path handed to {@code BreadthCrawler}
     * @param autoParse auto-parse flag handed to {@code BreadthCrawler}
     * @param username  account name passed to {@code WeiboCN.getSinaCookie}
     * @param password  account password passed to {@code WeiboCN.getSinaCookie}
     * @throws Exception if the cookie cannot be obtained
     */
    public WeiboCrawler(String crawlPath, boolean autoParse, String username, String password)
            throws Exception {
        super(crawlPath, autoParse);
        cookie = WeiboCN.getSinaCookie(username, password);
    }

    /**
     * Issues the HTTP request for a crawl task with the login cookie attached.
     *
     * @param crawlDatum the task to fetch
     * @return the response of the request
     * @throws Exception if the request fails
     */
    @Override
    public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
        HttpRequest request = new HttpRequest(crawlDatum);
        request.setCookie(cookie);
        return request.getResponse();
    }

    /**
     * Stores every non-empty {@code div[id].c} element of the page as a document with the
     * fields {@code url}, {@code title}, {@code content} and {@code inlink}.
     *
     * <p>Storage is best-effort: any failure is reported on {@code System.err} and the page is
     * skipped, so a database problem does not abort the crawl.
     *
     * @param page the fetched page
     * @param next follow-up tasks; not modified by this method
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        String inlink = page.meta("inlink");
        String title = page.select("title").text();
        String url = page.getUrl();
        Elements weibos = page.select("div[id].c");

        MongoClient mongoClient = null;
        try {
            mongoClient = new MongoClient(MONGO_HOST, MONGO_PORT);
            MongoDatabase mongoDatabase = mongoClient.getDatabase(DATABASE_NAME);
            System.out.println("Connect to database successfully");
            MongoCollection<Document> collection = mongoDatabase.getCollection(COLLECTION_NAME);

            // Collect all posts of the page first so they are written in one round trip.
            List<Document> documents = new ArrayList<Document>();
            for (Element weibo : weibos) {
                String content = weibo.text();
                if (content.length() != 0) {
                    documents.add(new Document("url", url).append("title", title)
                            .append("content", content).append("inlink", inlink));
                }
            }
            // insertMany must not be called with an empty list.
            if (!documents.isEmpty()) {
                collection.insertMany(documents);
            }
            // NOTE(review): this message appears mis-encoded in the file; kept unchanged.
            System.out.println("??");
        } catch (Exception e) {
            System.err.println(e.getClass().getName() + ": " + e.getMessage());
        } finally {
            // Always release the connection, including when storing failed.
            if (mongoClient != null) {
                mongoClient.close();
            }
        }
    }

    /**
     * Drops the result collection if it exists, so each run starts from an empty collection.
     */
    private static void dropStoredPosts() {
        MongoClient mongoClient = new MongoClient(MONGO_HOST, MONGO_PORT);
        try {
            MongoDatabase database = mongoClient.getDatabase(DATABASE_NAME);
            List<String> names = database.listCollectionNames().into(new ArrayList<String>());
            if (names.contains(COLLECTION_NAME)) {
                database.getCollection(COLLECTION_NAME).drop();
            }
        } finally {
            mongoClient.close();
        }
    }

    /**
     * Clears the result collection, seeds the first pages of every configured account and
     * runs the crawl with three threads.
     *
     * @param args unused
     * @throws Exception if the login or the crawl fails
     */
    public static void main(String[] args) throws Exception {
        dropStoredPosts();

        WeiboCrawler crawler = new WeiboCrawler("weibo_crawler", true);
        crawler.setThreads(3);

        for (String account : SEED_ACCOUNTS) {
            for (int pageNo = 1; pageNo <= PAGES_PER_ACCOUNT; pageNo++) {
                crawler.addSeed(
                        new CrawlDatum("http://weibo.cn/" + account + "?vt=4&page=" + pageNo)
                                .meta("inlink", "seed").meta("depth", "1"));
            }
        }

        // Rules starting with '-' exclude matching URLs.
        crawler.addRegex("-.*\\.(jpg|png|gif).*");
        crawler.addRegex("-.*top.*");
        // Same registration order as before: last seeded account first.
        for (int i = SEED_ACCOUNTS.length - 1; i >= 0; i--) {
            crawler.addRegex("http://weibo.cn/" + SEED_ACCOUNTS[i] + ".*");
        }
        crawler.addRegex("http://weibo.cn/comment/.*");

        crawler.start(1);
    }
}