List of usage examples for the com.mongodb.MongoClient constructor MongoClient
public MongoClient(final MongoClientURI uri, final MongoDriverInformation mongoDriverInformation)
From source file:cloud.simple.RuleEngineApplication.java
@Bean public MongoDatabase dataSource() { String servers = env.getProperty("spring.data.mongodb.custom.service"); String databaseName = env.getProperty("spring.data.mongodb.database"); List<ServerAddress> seeds = new ArrayList<ServerAddress>(); String[] servers1 = servers.split(","); for (String server : servers1) { String[] server1 = server.split(":"); seeds.add(new ServerAddress(server1[0], Integer.parseInt(server1[1]))); }// w w w . j av a 2s. c om Builder builder = MongoClientOptions.builder(); builder.socketKeepAlive(true); builder.readPreference(ReadPreference.secondaryPreferred()); MongoClientOptions options = builder.build(); @SuppressWarnings("resource") MongoClient mongoClient = new MongoClient(seeds, options); return mongoClient.getDatabase(databaseName); }
From source file:cn.edu.hfut.dmic.webcollector.example.DemoSelenium.java
License:Open Source License
public static void main(String[] args) throws Exception { Executor executor = new Executor() { @Override//from w w w.j ava 2 s .co m public void execute(CrawlDatum datum, CrawlDatums next) throws Exception { MongoClient mongoClient = new MongoClient("localhost", 27017); // ? // DBCollection dbCollection = mongoClient.getDB("maoyan_crawler").getCollection("rankings_am"); DB db = mongoClient.getDB("maoyan_crawler"); // ????? Set<String> colls = db.getCollectionNames(); for (String s : colls) { // Collection(?"") if (s.equals("attend_rate")) { db.getCollection(s).drop(); } } DBCollection dbCollection = db.getCollection("attend_rate"); HtmlUnitDriver driver = new HtmlUnitDriver(); driver.setJavascriptEnabled(false); driver.get(datum.getUrl()); System.out.println(driver.getPageSource()); WebElement click_view = driver.findElement(By.xpath("//div[@id='seatContent']//span[1]")); click_view.click(); String gold_seat = driver.getWindowHandle(); driver.switchTo().window(gold_seat); System.out.println(driver.getPageSource()); WebElement city_name = driver.findElement(By.xpath("//*[@id='all-citys']/div[1]/ul/li[1]/a")); System.out.println(city_name.getText()); WebElement element = driver.findElementByCssSelector("div#seat_table"); List<WebElement> movie_name = element.findElements(By.className("c1 lineDot")); List<WebElement> boxoffice_rate = element.findElements(By.className("c2 red")); List<WebElement> visit_pershow = element.findElements(By.className("c3 gray")); WebElement cityarea = driver.findElementByCssSelector("span[class='today']"); System.out.println(cityarea.getText()); for (int i = 0; i < movie_name.size(); i++) { System.out.println(movie_name.get(i).getText()); System.out.println(boxoffice_rate.get(i).getText()); System.out.println(visit_pershow.get(i).getText()); BasicDBObject dbObject = new BasicDBObject(); dbObject.append("title", cityarea.getText()).append("movie_name", movie_name.get(i).getText()) .append("boxoffice_rate", boxoffice_rate.get(i).getText()) 
.append("visit_pershow", visit_pershow.get(i).getText()); dbCollection.insert(dbObject); } mongoClient.close(); } }; //DBDBManager DBManager manager = new BerkeleyDBManager("crawl"); //Crawler?DBManagerExecutor Crawler crawler = new Crawler(manager, executor); crawler.addSeed("http://pf.maoyan.com/attend/rate"); crawler.start(1); }
From source file:cn.edu.hfut.dmic.webcollector.example.Douban2Crawler.java
License:Open Source License
@Override public void visit(Page page, CrawlDatums next) { System.out.println(page.getUrl()); MongoClient mongoClient = new MongoClient("localhost", 27017); // ?/*ww w . j a va2s . c o m*/ MongoDatabase mongoDatabase = mongoClient.getDatabase("douban_crawler"); System.out.println("Connect to database successfully"); MongoCollection<org.bson.Document> collection = mongoDatabase.getCollection("moviereview"); List<Document> documents = new ArrayList<Document>(); if (page.matchUrl("https://movie.douban.com/review/\\d*/")) { index = index + 1; //String title = page.select("div[class=article_title]").first().text(); //String author = page.select("div[id=blog_userface]").first().text(); //System.out.println("title:" + title + "\tauthor:" + author); //HashMap md = page.getMetaData(); //String review = page.getMetaData("og:description").toString(); //Document doc = page.getDoc(); //System.out.println(doc); System.out.println("index:" + index); String review = page.select("div.main-bd").text(); System.out.println("review:" + review); String movieName = page.select("div.side-back").text(); System.out.println("movieName:" + movieName); ArrayList rc = page.select("div.comment-item"); Document document = new Document(); document.put("moviename", movieName); document.put("review", review); List<String> subreviews = new ArrayList<String>(); for (int i = 0; i < rc.size(); i++) { //System.out.println(rc.get(i).getClass()); //((org.jsoup.nodes.Element)rc.get(i)).getAllElements().get(3); org.jsoup.nodes.Element element = (org.jsoup.nodes.Element) ((org.jsoup.nodes.Element) ((org.jsoup.nodes.Element) rc .get(i)).childNodes().get(3)).childNode(3); //Object nodes = ((org.jsoup.nodes.Element)((org.jsoup.nodes.Element)rc.get(i)).childNodes()).getAllElements().get(3); String subReview = element.childNode(0).toString(); System.out.println("subReview:" + subReview); if (subReview != "") subreviews.add(subReview); } document.put("subreviews", subreviews); collection.insertOne(document); 
//documents.add(document); //collection.insertMany(documents); } else if (page.matchUrl("https://movie.douban.com/review/best/\\?start=\\d*")) { //Document doc = page.getDoc(); System.out.println("add seed"); //this.addSeed(page.getUrl()); } }
From source file:cn.edu.hfut.dmic.webcollector.example.FirefoxSelenium2.java
License:Open Source License
public static void main(String[] args) throws Exception { Executor executor = new Executor() { @Override// ww w . j a va 2s.co m public void execute(CrawlDatum datum, CrawlDatums next) throws Exception { MongoClient mongoClient = new MongoClient("localhost", 27017); // ? // DBCollection dbCollection = mongoClient.getDB("maoyan_crawler").getCollection("rankings_am"); DB db = mongoClient.getDB("maoyan_crawler"); // ????? Set<String> colls = db.getCollectionNames(); for (String s : colls) { // Collection(?"") if (s.equals("rankings_am")) { db.getCollection(s).drop(); } } DBCollection dbCollection = db.getCollection("attend_rate"); ProfilesIni pi = new ProfilesIni(); FirefoxProfile profile = pi.getProfile("default"); WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38); // driver.setJavascriptEnabled(false); webClient.getOptions().setCssEnabled(true); HtmlPage page = webClient.getPage(datum.getUrl()); // System.out.println(driver.getPageSource()); // System.out.println(page.getByXPath("//div[@id='seat_table']//ul//li[@class='c1 lineDot']/text()")); System.out.println(page.getByXPath("//span[@class='today']/em/text()")); System.out.println(page.getByXPath("//span[@class='today']/text()")); List<?> movie_name = page.getByXPath("//div[@id='seat_table']//ul//li[@class='c1 lineDot']/text()"); List<?> boxoffice_rate = page.getByXPath("//div[@id='seat_table']//ul//li[@class='c2 red']/text()"); List<?> visit_pershow = page.getByXPath("//div[@id='seat_table']//ul//li[@class='c3 gray']/text()"); for (int i = 0; i < movie_name.size(); i++) { System.out.println(movie_name.get(i)); System.out.println(boxoffice_rate.get(i)); System.out.println(visit_pershow.get(i)); } // BasicDBObject dbObject = new BasicDBObject(); // dbObject.append("title", title).append("rank", amList.get(0)).append("mov_cnname", cn_name).append("mov_enname", en_name).append("toweek_rev", amList.get(2)).append("total_rev", amList.get(3)).append("val_week", amList.get(4)); // 
dbCollection.insert(dbObject); webClient.closeAllWindows(); } }; //DBDBManager DBManager manager = new BerkeleyDBManager("crawl"); //Crawler?DBManagerExecutor Crawler crawler = new Crawler(manager, executor); crawler.addSeed("http://pf.maoyan.com/attend/rate"); crawler.start(1); }
From source file:cn.edu.hfut.dmic.webcollector.example.FirefoxSelenium3.java
License:Open Source License
public static void main(String[] args) throws Exception { Executor executor = new Executor() { @Override/*from w ww .j ava2 s .co m*/ public void execute(CrawlDatum datum, CrawlDatums next) throws Exception { MongoClient mongoClient = new MongoClient("localhost", 27017); // ? // DBCollection dbCollection = mongoClient.getDB("maoyan_crawler").getCollection("rankings_am"); DB db = mongoClient.getDB("maoyan_crawler"); // ????? Set<String> colls = db.getCollectionNames(); for (String s : colls) { // Collection(?"") if (s.equals("attend_rate")) { db.getCollection(s).drop(); } } DBCollection dbCollection = db.getCollection("attend_rate"); ProfilesIni pi = new ProfilesIni(); FirefoxProfile profile = pi.getProfile("default"); WebDriver driver = new FirefoxDriver(profile); driver.manage().window().maximize(); driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS); // driver.setJavascriptEnabled(false); driver.get(datum.getUrl()); // System.out.println(driver.getPageSource()); driver.findElement(By.xpath("//*[@id='seat_city']")).click(); driver.switchTo().window(driver.getWindowHandle()); int city_num = driver.findElements(By.xpath("//div[@id='all-citys']/div/ul/li/a")).size(); for (int i = 0; i < city_num; i++) { System.out.println("A city chosen" + i); System.out.println( driver.findElements(By.xpath("//div[@id='all-citys']/div/ul/li/a")).get(i).getText()); String city = driver.findElements(By.xpath("//div[@id='all-citys']/div/ul/li/a")).get(i) .getText(); ((JavascriptExecutor) driver).executeScript("arguments[0].scrollIntoView(true);", driver.findElements(By.xpath("//div[@id='all-citys']/div/ul/li/a")).get(i)); ((JavascriptExecutor) driver).executeScript("window.scrollBy(0, -250)", ""); Thread.sleep(1000); new Actions(driver) .moveToElement( driver.findElements(By.xpath("//div[@id='all-citys']/div/ul/li/a")).get(i)) .click().perform(); driver.switchTo().window(driver.getWindowHandle()); // 
System.out.println(driver.findElement(By.xpath("//span[@class='today']/em")).getText()); System.out.println(driver.findElement(By.xpath("//span[@class='today']")).getText()); for (int j = 0; j < driver .findElements(By.xpath("//div[@id='seat_table']//ul//li[@class='c1 lineDot']")) .size(); j++) { System.out.println(driver .findElements(By.xpath("//div[@id='seat_table']//ul//li[@class='c1 lineDot']")) .get(j).getText()); System.out.println( driver.findElements(By.xpath("//div[@id='seat_table']//ul//li[@class='c2 red']")) .get(j).getText()); System.out.println( driver.findElements(By.xpath("//div[@id='seat_table']//ul//li[@class='c3 gray']")) .get(j).getText()); BasicDBObject dbObject = new BasicDBObject(); dbObject.append("title", driver.findElement(By.xpath("//span[@class='today']")).getText()) .append("city", city) .append("mov_cnname", driver.findElements( By.xpath("//div[@id='seat_table']//ul//li[@class='c1 lineDot']")) .get(j).getText()) .append("boxoffice_rate", driver .findElements(By.xpath("//div[@id='seat_table']//ul//li[@class='c2 red']")) .get(j).getText()) .append("visit_pershow", driver .findElements(By.xpath("//div[@id='seat_table']//ul//li[@class='c3 gray']")) .get(j).getText()); dbCollection.insert(dbObject); } System.out.println("new city list to choose"); new Actions(driver).moveToElement(driver.findElement(By.xpath("//*[@id='seat_city']"))).click() .perform(); driver.switchTo().window(driver.getWindowHandle()); Thread.sleep(500); } driver.close(); driver.quit(); mongoClient.close(); } }; //DBDBManager DBManager manager = new BerkeleyDBManager("crawl"); //Crawler?DBManagerExecutor Crawler crawler = new Crawler(manager, executor); crawler.addSeed("http://pf.maoyan.com/attend/rate"); crawler.start(1); }
From source file:cn.edu.hfut.dmic.webcollector.example.FirefoxSelenium4.java
License:Open Source License
public static void main(String[] args) throws Exception { Executor executor = new Executor() { @Override/*from w w w . j av a2 s. c o m*/ public void execute(CrawlDatum datum, CrawlDatums next) throws Exception { MongoClient mongoClient = new MongoClient("localhost", 27017); // ? // DBCollection dbCollection = mongoClient.getDB("maoyan_crawler").getCollection("rankings_am"); DB db = mongoClient.getDB("maoyan_crawler"); // ????? Set<String> colls = db.getCollectionNames(); for (String s : colls) { // Collection(?"") if (s.equals("rankings_am")) { db.getCollection(s).drop(); } } DBCollection dbCollection = db.getCollection("attend_rate"); ProfilesIni pi = new ProfilesIni(); FirefoxProfile profile = pi.getProfile("default"); WebDriver driver = new FirefoxDriver(profile); driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS); // driver.setJavascriptEnabled(false); driver.get(datum.getUrl()); // System.out.println(driver.getPageSource()); List<WebElement> movie_name = driver .findElements(By.xpath("//div[@id='seat_table']//ul//li[@class='c1 lineDot']")); List<WebElement> boxoffice_rate = driver .findElements(By.xpath("//div[@id='seat_table']//ul//li[@class='c2 red']")); List<WebElement> visit_pershow = driver .findElements(By.xpath("//div[@id='seat_table']//ul//li[@class='c3 gray']")); WebElement title = driver.findElement(By.xpath("//span[@class='today']/em")); WebElement title2 = driver.findElement(By.xpath("//span[@class='today']")); System.out.println(title.getText()); System.out.println(title.getText()); for (int i = 0; i < movie_name.size(); i++) { System.out.println(movie_name.get(i).getText()); System.out.println(boxoffice_rate.get(i).getText()); System.out.println(visit_pershow.get(i).getText()); // BasicDBObject dbObject = new BasicDBObject(); // dbObject.append("title", title).append("rank", amList.get(0)).append("mov_cnname", cn_name).append("mov_enname", en_name).append("toweek_rev", amList.get(2)).append("total_rev", 
amList.get(3)).append("val_week", amList.get(4)); // dbCollection.insert(dbObject); } driver.quit(); } }; //DBDBManager DBManager manager = new BerkeleyDBManager("crawl"); //Crawler?DBManagerExecutor Crawler crawler = new Crawler(manager, executor); crawler.addSeed("http://pf.maoyan.com/attend/rate"); crawler.start(1); }
From source file:cn.edu.hfut.dmic.webcollector.example.TutorialCrawler.java
License:Open Source License
@Override public void visit(Page page, CrawlDatums next) { if (page.matchUrl("http://pf.maoyan.com/rankings/america.*")) { MongoClient mongoClient = new MongoClient("localhost", 27017); // ?// w w w . j a va 2 s . c o m // DBCollection dbCollection = mongoClient.getDB("maoyan_crawler").getCollection("rankings_am"); DB db = mongoClient.getDB("maoyan_crawler"); // ????? Set<String> colls = db.getCollectionNames(); for (String s : colls) { // Collection(?"") if (s.equals("rankings_am")) { db.getCollection(s).drop(); } } DBCollection dbCollection = db.getCollection("rankings_am"); String title = page.select("span[id=year-box]").text(); Elements table = page.select("table[id=na-list]"); Elements data_set = table.select("tr"); List amList = new ArrayList(); for (Element id : data_set) { Elements tds = id.select("td"); for (Element td : tds) { amList.add(td.text()); } String en_name = tds.select("p[class=first-line]").text(); String cn_name = tds.select("p[class=second-line]").text(); if (amList.size() > 0) { System.out.println(amList); BasicDBObject dbObject = new BasicDBObject(); dbObject.append("title", title).append("rank", amList.get(0)).append("mov_cnname", cn_name) .append("mov_enname", en_name).append("toweek_rev", amList.get(2)) .append("total_rev", amList.get(3)).append("val_week", amList.get(4)); amList.removeAll(amList); dbCollection.insert(dbObject); } } mongoClient.close(); } else if (page.matchUrl("http://pf.maoyan.com/rankings/day.*")) { MongoClient mongoClient2 = new MongoClient("localhost", 27017); // ? DB db = mongoClient2.getDB("maoyan_crawler"); // ????? 
Set<String> colls = db.getCollectionNames(); for (String s : colls) { // Collection(?"") if (s.equals("rankings_day")) { db.getCollection(s).drop(); } } DBCollection dbCollection2 = db.getCollection("rankings_day"); String title = page.select("span[id=year-box]").text(); String update_time = page.select("span[id=update-time]").text(); title = title + update_time; System.out.println(title); Elements data_set = page.select("tr"); List dayList = new ArrayList(); for (Element id : data_set) { Elements tds = id.select("td"); for (Element td : tds) { dayList.add(td.text()); } if (dayList.size() > 0) { System.out.println(dayList); BasicDBObject dbObject = new BasicDBObject(); dbObject.append("title", title).append("rank", dayList.get(0)) .append("mov_name", dayList.get(1)).append("today_rev", dayList.get(2)) .append("date", dayList.get(3)).append("val_week", dayList.get(4)); dayList.removeAll(dayList); dbCollection2.insert(dbObject); } } mongoClient2.close(); } else if (page.matchUrl("http://pf.maoyan.com/rankings/market.*")) { MongoClient mongoClient3 = new MongoClient("localhost", 27017); // ? DB db = mongoClient3.getDB("maoyan_crawler"); // ????? 
Set<String> colls = db.getCollectionNames(); for (String s : colls) { // Collection(?"") if (s.equals("rankings_market")) { db.getCollection(s).drop(); } } DBCollection dbCollection3 = db.getCollection("rankings_market"); String title = page.select("span[id=year-box]").text(); String update_time = page.select("span[id=update-time]").text(); title = title + update_time; System.out.println(title); Elements data_set = page.select("tr"); List dayList = new ArrayList(); for (Element id : data_set) { Elements tds = id.select("td"); for (Element td : tds) { dayList.add(td.text()); } if (dayList.size() > 0) { System.out.println(dayList); BasicDBObject dbObject = new BasicDBObject(); dbObject.append("title", title).append("rank", dayList.get(0)).append("date", dayList.get(1)) .append("today_rev", dayList.get(2)).append("total_sessions", dayList.get(3)) .append("total_visit_count", dayList.get(4)); dayList.removeAll(dayList); dbCollection3.insert(dbObject); } } mongoClient3.close(); } else if (page.matchUrl("http://pf.maoyan.com/rankings/year.*")) { MongoClient mongoClient4 = new MongoClient("localhost", 27017); // ? DB db = mongoClient4.getDB("maoyan_crawler"); // ????? 
Set<String> colls = db.getCollectionNames(); for (String s : colls) { // Collection(?"") if (s.equals("rankings_year")) { db.getCollection(s).drop(); } } DBCollection dbCollection4 = db.getCollection("rankings_year"); String title = page.select("span[id=year-box]").text(); String update_time = page.select("span[id=update-time]").text(); title = title + update_time; System.out.println(title); Elements table = page.select("div[id=ranks-list]"); // System.out.println(table); Elements data_set = table.select("ul[class=row]"); // System.out.println(data_set); List dayList = new ArrayList(); for (Element id : data_set) { Elements lis = id.select("li"); for (Element li : lis) { dayList.add(li.text()); } String cn_name = lis.select("p[class=first-line]").text(); String release_date = lis.select("p[class=second-line]").text(); if (dayList.size() > 0) { System.out.println(dayList); BasicDBObject dbObject = new BasicDBObject(); dbObject.append("title", title).append("rank", dayList.get(0)).append("name", cn_name) .append("release date", release_date).append("year_rev", dayList.get(2)) .append("avg_price", dayList.get(3)).append("avg_visit_count", dayList.get(4)); dayList.removeAll(dayList); dbCollection4.insert(dbObject); } } mongoClient4.close(); } else if (page.matchUrl("http://pf.maoyan.com/")) { MongoClient mongoClient5 = new MongoClient("localhost", 27017); // ? DB db = mongoClient5.getDB("maoyan_crawler"); // ????? 
Set<String> colls = db.getCollectionNames(); for (String s : colls) { // Collection(?"") if (s.equals("main_page")) { db.getCollection(s).drop(); } } DBCollection dbCollection5 = db.getCollection("main_page"); String title = page.select("span[id=dayStr]").text(); String box_type = page.select("span[id=box-type]").text(); String ticket_count = page.select("span[id=ticket_count]").text(); box_type = box_type + ticket_count; System.out.println(title + "\n" + box_type); Elements table = page.select("div[id=ticket_tbody]"); //System.out.println(table); Elements data_set = table.select("ul"); //System.out.println(data_set); List dayList = new ArrayList(); for (Element id : data_set) { Elements lis = id.select("li"); for (Element li : lis) { dayList.add(li.text()); } // System.out.println(lis); String cn_name = lis.select("b").first().text(); String comment = lis.select("em").text(); // System.out.println(cn_name+ "\n" +comment); if (dayList.size() > 0) { System.out.println(dayList); BasicDBObject dbObject = new BasicDBObject(); dbObject.append("title", title).append("box_type", box_type).append("name", cn_name) .append("comment", comment).append("realtime_rev", dayList.get(1)) .append("rev_percent", dayList.get(2)).append("schedule_percent", dayList.get(3)) .append("total_rev", dayList.get(4)); dayList.removeAll(dayList); dbCollection5.insert(dbObject); } } mongoClient5.close(); } }
From source file:cn.edu.hfut.dmic.webcollector.example.WeiboCrawler.java
License:Open Source License
@Override public void visit(Page page, CrawlDatums next) { int pageNum = Integer.valueOf(page.getMetaData("pageNum")); /*??*//* w w w . ja va2 s. co m*/ Elements weibos = page.select("div.c"); try { MongoClient mongoClient = new MongoClient("localhost", 27017); // ? MongoDatabase mongoDatabase = mongoClient.getDatabase("weibo_crawler"); System.out.println("Connect to database successfully"); MongoCollection<Document> collection = mongoDatabase.getCollection("webpage"); //? /** * 1. org.bson.Document ?key-value? * 2. ?List<Document> * 3. ???? mongoCollection.insertMany(List<Document>) ??? mongoCollection.insertOne(Document) * */ for (Element weibo : weibos) { Document document = new Document("content", "" + pageNum + "" + ":" + weibo.text()); List<Document> documents = new ArrayList<Document>(); documents.add(document); collection.insertMany(documents); } System.out.println("??"); mongoClient.close(); } catch (Exception e) { System.err.println(e.getClass().getName() + ": " + e.getMessage()); } }
From source file:cn.edu.hfut.dmic.webcollector.example.WeiboCrawler.java
License:Open Source License
/**
 * Crawls the first five pages of {@code http://weibo.cn/zhouhongyi} with three threads.
 *
 * @param args unused
 * @throws Exception if the crawler fails
 */
public static void main(String[] args) throws Exception {
    MongoClient mongoClient = new MongoClient("localhost", 27017);
    try {
        WeiboCrawler crawler = new WeiboCrawler("weibo_crawler", mongoClient, false);
        crawler.setThreads(3);
        // Seed pages 1..5; the page number travels with the seed as "pageNum" metadata.
        for (int i = 1; i <= 5; i++) {
            crawler.addSeed(
                    new CrawlDatum("http://weibo.cn/zhouhongyi?vt=4&page=" + i).putMetaData("pageNum", i + ""));
        }
        crawler.start(1);
    } finally {
        // NOTE(review): assumes start() returns only after the crawl has finished — confirm.
        mongoClient.close();
    }
}
From source file:cn.edu.hfut.dmic.webcollector.lazy.util.MongoHelper.java
License:Open Source License
/**
 * Connects to a MongoDB server and resolves the collection this helper works on.
 *
 * @param ip host name or IP address of the MongoDB server
 * @param port port of the MongoDB server
 * @param dbName name of the database to open
 * @param collectionName name of the collection inside {@code dbName}
 */
public MongoHelper(String ip, int port, String dbName, String collectionName) {
    this.client = new MongoClient(ip, port);
    this.db = this.client.getDatabase(dbName);
    this.collection = this.db.getCollection(collectionName);
}