// Java tutorial
/** * * APDPlat - Application Product Development Platform * Copyright (c) 2013, ??, yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.seo.rank.list.impl; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import org.apache.commons.lang.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.seo.rank.list.Parser; import org.seo.rank.list.UrlTools; import org.seo.rank.model.Article; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * * @author ?? 
*/ public class DefaultParser implements Parser { private static final Logger LOGGER = LoggerFactory.getLogger(DefaultParser.class); private static final String ACCEPT = "text/html, */*; q=0.01"; private static final String ENCODING = "gzip, deflate"; private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; private static final String CONNECTION = "keep-alive"; private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0"; @Override public List<Article> parse(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery) { List<Article> articles = new ArrayList<>(); try { Document document = Jsoup.connect(url).header("Accept", ACCEPT).header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE).header("Connection", CONNECTION) .header("User-Agent", USER_AGENT).get(); Elements elements = document.select(titleCssQuery); for (Element element : elements) { String title = element.text(); String href = element.attr("href"); if (!StringUtils.isBlank(title) && !StringUtils.isBlank(href)) { href = UrlTools.normalizeUrl(url, href); Article article = new Article(); article.setTitle(title); article.setUrl(href); articles.add(article); } else { LOGGER.info("?" + url + " title:" + title + ", href:" + href); } } //?? String nextPageUrl = getNextPageUrl(document, nextPageCssQuery, nextPageText); LOGGER.debug("" + nextPageUrl); if (nextPageUrl != null) { nextPageUrl = UrlTools.normalizeUrl(url, nextPageUrl); LOGGER.debug("?" + nextPageUrl); //? List<Article> result = parse(nextPageUrl, nextPageCssQuery, nextPageText, titleCssQuery); articles.addAll(result); } else { LOGGER.info("??" + url); } } catch (Exception e) { LOGGER.error("?" + url, e); } return articles; } /** * ?? * @param document * @param nextPageCssQuery ?CSS * @param nextPageText CSS * @return ? 
*/ private String getNextPageUrl(Document document, String nextPageCssQuery, String nextPageText) { Elements elements = document.select(nextPageCssQuery); for (Element element : elements) { String text = element.text(); LOGGER.debug(text); if (text != null && nextPageText.trim().equals(text.trim())) { String href = element.attr("href"); return href; } } return null; } public static List<Article> run(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery) { Parser parser = new DefaultParser(); long start = System.currentTimeMillis(); List<Article> articles = parser.parse(url, nextPageCssQuery, nextPageText, titleCssQuery); long cost = System.currentTimeMillis() - start; int i = 1; for (Article article : articles) { LOGGER.info((i++) + "?" + article.getTitle() + " : " + article.getUrl()); } LOGGER.info(" " + articles.size() + " " + cost / 1000.0 + " "); return articles; } public static List<Article> iteyeBlog() { String url = "http://yangshangchuan.iteye.com/"; String nextPageCssQuery = "html body div#page div#content.clearfix div#main div.pagination a.next_page"; String nextPageText = " "; String titleCssQuery = "html body div#page div#content.clearfix div#main div.blog_main div.blog_title h3 a"; return run(url, nextPageCssQuery, nextPageText, titleCssQuery); } public static List<Article> iteyeNews() { String url = "http://www.iteye.com/news"; String nextPageCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.pagination a.next_page"; String nextPageText = " "; //h3 > ah3??a h3 span.category a ? 
String titleCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.news.clearfix div.content h3 > a"; return run(url, nextPageCssQuery, nextPageText, titleCssQuery); } public static List<Article> iteyeMagazines() { String url = "http://www.iteye.com/magazines"; String nextPageCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.pagination a.next_page"; String nextPageText = " "; String titleCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.news.clearfix div.content h3 a"; return run(url, nextPageCssQuery, nextPageText, titleCssQuery); } public static List<Article> csdnBlog() { String url = "http://blog.csdn.net/iispring"; String nextPageCssQuery = "html body div#container div#body div#main div.main div#papelist.pagelist a"; String titleCssQuery = "html body div#container div#body div#main div.main div#article_list.list div.list_item.article_item div.article_title h1 span.link_title a"; String nextPageText = ""; return run(url, nextPageCssQuery, nextPageText, titleCssQuery); } public static List<Article> oschinaNews() { String url = "http://www.oschina.net/news"; String nextPageCssQuery = "html body div#OSC_Screen div#OSC_Content.CenterDiv div#NewsChannel.Channel div#NewsList.ListPanel div#RecentNewsList.panel ul.pager li.page.next a"; String titleCssQuery = "html body div#OSC_Screen div#OSC_Content.CenterDiv div#NewsChannel.Channel div#NewsList.ListPanel div#RecentNewsList.panel ul.List li h2 a"; String nextPageText = ">"; return run(url, nextPageCssQuery, nextPageText, titleCssQuery); } public static List<Article> oschinaBlog() { String url = "http://my.oschina.net/apdplat/blog"; String nextPageCssQuery = "html body div#OSC_Screen div#OSC_Content div.blog-user div.container div.flex-item div#search_list.flex-item-9.flex-item-md-9.content div#list.list.blog-list div.pages.sm-hide ul li a"; String titleCssQuery = "html body div#OSC_Screen div#OSC_Content div.blog-user div.container 
div.flex-item div#search_list.flex-item-9.flex-item-md-9.content div#list.list.blog-list div.list-item div.layout div.layout-column div.title a"; String nextPageText = ""; return run(url, nextPageCssQuery, nextPageText, titleCssQuery); } public static List<Article> baidu(String query) { //?? try { query = URLEncoder.encode(query, "UTF-8"); } catch (UnsupportedEncodingException e) { LOGGER.error("url", e); return Collections.emptyList(); } if (StringUtils.isBlank(query)) { return Collections.emptyList(); } String url = "http://www.baidu.com/s?wd=" + query; String nextPageCssQuery = "html body div div div p#page a.n"; String titleCssQuery = "html body div div div div div h3.t a"; String nextPageText = ">"; return run(url, nextPageCssQuery, nextPageText, titleCssQuery); } /** * OSCHINA?ITEYE?? */ public static void blogCompare() { List<Article> ob = oschinaBlog(); List<Article> ib = iteyeBlog(); Map<String, String> om = new HashMap<>(); Map<String, String> im = new HashMap<>(); ob.stream().forEach(b -> om.put(b.getTitle(), b.getUrl())); ib.stream().forEach(b -> im.put(b.getTitle(), b.getUrl())); List<String> iteyeBlog = ib.stream().map(b -> b.getTitle().replace("[]", "").trim()).sorted() .collect(Collectors.toList()); List<String> oschinaBlog = ob.stream().map(b -> b.getTitle()).sorted().collect(Collectors.toList()); List<String> commons = oschinaBlog.stream().filter(b -> iteyeBlog.contains(b)).collect(Collectors.toList()); LOGGER.info("<h4>oschinaiteye(" + commons.size() + ")</h4>"); AtomicInteger j = new AtomicInteger(); commons.forEach(item -> LOGGER .info(j.incrementAndGet() + "?" 
+ item + " <a target=\"_blank\" href=\"" + om.get(item) + "\">oschina</a> <a target=\"_blank\" href=\"" + im.get(item) + "\">iteye</a><br/>")); List<String> oschina = oschinaBlog.stream().filter(i -> !iteyeBlog.contains(i)) .collect(Collectors.toList()); LOGGER.info("<h4>oschina(" + oschina.size() + ")</h4>"); AtomicInteger l = new AtomicInteger(); oschina.forEach(item -> LOGGER.info(l.incrementAndGet() + "?<a target=\"_blank\" href=\"" + om.get(item) + "\">" + item + "</a><br/>")); List<String> iteye = iteyeBlog.stream().filter(i -> !oschinaBlog.contains(i)).collect(Collectors.toList()); LOGGER.info("<h4>iteye(" + iteye.size() + ")</h4>"); AtomicInteger k = new AtomicInteger(); iteye.forEach(item -> LOGGER.info(k.incrementAndGet() + "?<a target=\"_blank\" href=\"" + im.get(item) + "\">" + item + "</a><br/>")); } public static void main(String[] args) { //iteyeBlog(); //iteyeNews(); //iteyeMagazines(); //csdnBlog(); //oschinaNews(); //oschinaBlog(); //baidu("Java???APDPlat??"); blogCompare(); } }