org.seo.rank.list.impl.DefaultParser.java Source code

Java tutorial

Introduction

Here is the source code for org.seo.rank.list.impl.DefaultParser.java

Source

/**
 * 
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, Yang Shangchuan, yang-shangchuan@qq.com
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package org.seo.rank.list.impl;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.seo.rank.list.Parser;
import org.seo.rank.list.UrlTools;
import org.seo.rank.model.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link Parser} implementation that downloads a list page with jsoup, extracts
 * article titles and links with a CSS selector, and follows the "next page" link
 * until no further page is found.
 *
 * <p>The static helpers ({@link #iteyeBlog()}, {@link #oschinaNews()}, ...) bundle
 * hard-coded URLs and selectors for specific sites and delegate to
 * {@link #run(String, String, String, String)}.
 *
 * @author Yang Shangchuan (inferred from the license header e-mail; the original
 *         author text was mis-encoded)
 */
public class DefaultParser implements Parser {
    private static final Logger LOGGER = LoggerFactory.getLogger(DefaultParser.class);
    // HTTP request headers sent with every page fetch.
    private static final String ACCEPT = "text/html, */*; q=0.01";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0";

    /**
     * Collects the articles of a paginated list, starting at {@code url} and following
     * the "next page" link page by page.
     *
     * <p>Pages are processed iteratively and every requested URL is remembered, so a
     * pagination link that points back to an already visited page ends the crawl
     * instead of looping forever. A failure on any page is logged and ends the crawl;
     * the articles gathered up to that point are still returned.
     *
     * @param url              URL of the first list page
     * @param nextPageCssQuery CSS selector matching candidate "next page" links
     * @param nextPageText     link text (compared after trimming) identifying the "next page" link
     * @param titleCssQuery    CSS selector matching the article title links
     * @return the articles found on all visited pages, in page order; never {@code null}
     */
    @Override
    public List<Article> parse(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery) {
        List<Article> articles = new ArrayList<>();
        // URLs already requested; used to detect circular pagination.
        Set<String> visited = new HashSet<>();
        String pageUrl = url;
        do {
            visited.add(pageUrl);
            String nextPageUrl = parsePage(pageUrl, nextPageCssQuery, nextPageText, titleCssQuery, articles);
            if (nextPageUrl != null && visited.contains(nextPageUrl)) {
                LOGGER.warn("Pagination loop detected, page already visited: " + nextPageUrl);
                nextPageUrl = null;
            }
            pageUrl = nextPageUrl;
        } while (pageUrl != null);
        return articles;
    }

    /**
     * Fetches a single list page, appends its articles to {@code articles} and resolves
     * the URL of the following page.
     *
     * @param url              URL of the page to fetch
     * @param nextPageCssQuery CSS selector matching candidate "next page" links
     * @param nextPageText     link text identifying the "next page" link
     * @param titleCssQuery    CSS selector matching the article title links
     * @param articles         accumulator receiving the articles found on this page
     * @return the normalized URL of the next page, or {@code null} when there is none
     *         or the page could not be processed
     */
    private String parsePage(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery,
            List<Article> articles) {
        try {
            Document document = Jsoup.connect(url)
                    .header("Accept", ACCEPT)
                    .header("Accept-Encoding", ENCODING)
                    .header("Accept-Language", LANGUAGE)
                    .header("Connection", CONNECTION)
                    .header("User-Agent", USER_AGENT)
                    .get();
            for (Element element : document.select(titleCssQuery)) {
                String title = element.text();
                String href = element.attr("href");
                if (StringUtils.isBlank(title) || StringUtils.isBlank(href)) {
                    // Matched element is not a usable title link; report and skip it.
                    LOGGER.info("?" + url + " title:" + title + ", href:" + href);
                    continue;
                }
                Article article = new Article();
                article.setTitle(title);
                article.setUrl(UrlTools.normalizeUrl(url, href));
                articles.add(article);
            }
            String nextPageUrl = getNextPageUrl(document, nextPageCssQuery, nextPageText);
            LOGGER.debug("" + nextPageUrl);
            // A missing href attribute yields an empty string; treat it like "no next page"
            // rather than resolving it against the current URL.
            if (StringUtils.isBlank(nextPageUrl)) {
                LOGGER.info("??" + url);
                return null;
            }
            nextPageUrl = UrlTools.normalizeUrl(url, nextPageUrl);
            LOGGER.debug("?" + nextPageUrl);
            return nextPageUrl;
        } catch (Exception e) {
            // Best effort: keep what has been collected so far and stop paginating.
            LOGGER.error("?" + url, e);
            return null;
        }
    }

    /**
     * Finds the "next page" link in a document.
     *
     * @param document         the parsed page
     * @param nextPageCssQuery CSS selector matching candidate "next page" links
     * @param nextPageText     text the link must have (both sides are trimmed before comparing)
     * @return the raw {@code href} of the first matching element, or {@code null} when
     *         no element matches or {@code nextPageText} is {@code null}
     */
    private String getNextPageUrl(Document document, String nextPageCssQuery, String nextPageText) {
        if (nextPageText == null) {
            return null;
        }
        String expectedText = nextPageText.trim();
        for (Element element : document.select(nextPageCssQuery)) {
            String text = element.text();
            LOGGER.debug(text);
            if (text != null && expectedText.equals(text.trim())) {
                return element.attr("href");
            }
        }
        return null;
    }

    /**
     * Parses a paginated list with a fresh {@link DefaultParser}, then logs every
     * article found together with the total count and the elapsed time in seconds.
     *
     * @param url              URL of the first list page
     * @param nextPageCssQuery CSS selector matching candidate "next page" links
     * @param nextPageText     link text identifying the "next page" link
     * @param titleCssQuery    CSS selector matching the article title links
     * @return the articles found
     */
    public static List<Article> run(String url, String nextPageCssQuery, String nextPageText,
            String titleCssQuery) {
        Parser parser = new DefaultParser();
        long start = System.currentTimeMillis();
        List<Article> articles = parser.parse(url, nextPageCssQuery, nextPageText, titleCssQuery);
        long cost = System.currentTimeMillis() - start;
        int i = 1;
        for (Article article : articles) {
            LOGGER.info((i++) + "?" + article.getTitle() + " : " + article.getUrl());
        }
        LOGGER.info(" " + articles.size() + " " + cost / 1000.0 + " ");
        return articles;
    }

    /**
     * Crawls the blog list at {@code http://yangshangchuan.iteye.com/}.
     *
     * @return the articles found
     */
    public static List<Article> iteyeBlog() {
        String url = "http://yangshangchuan.iteye.com/";
        String nextPageCssQuery = "html body div#page div#content.clearfix div#main div.pagination a.next_page";
        // NOTE(review): this label looks like it lost non-ASCII characters; confirm against the live page.
        String nextPageText = " ";
        String titleCssQuery = "html body div#page div#content.clearfix div#main div.blog_main div.blog_title h3 a";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Crawls the news list at {@code http://www.iteye.com/news}.
     *
     * @return the articles found
     */
    public static List<Article> iteyeNews() {
        String url = "http://www.iteye.com/news";
        String nextPageCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.pagination a.next_page";
        // NOTE(review): this label looks like it lost non-ASCII characters; confirm against the live page.
        String nextPageText = " ";
        // The child combinator (h3 > a) only matches links directly under h3.
        // NOTE(review): the original comment was mis-encoded; presumably this is meant to
        // exclude the "h3 span.category a" links.
        String titleCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.news.clearfix div.content h3 > a";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Crawls the magazine list at {@code http://www.iteye.com/magazines}.
     *
     * @return the articles found
     */
    public static List<Article> iteyeMagazines() {
        String url = "http://www.iteye.com/magazines";
        String nextPageCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.pagination a.next_page";
        // NOTE(review): this label looks like it lost non-ASCII characters; confirm against the live page.
        String nextPageText = " ";
        String titleCssQuery = "html body div#page div#content.clearfix div#main div#index_main div.news.clearfix div.content h3 a";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Crawls the blog list at {@code http://blog.csdn.net/iispring}.
     *
     * @return the articles found
     */
    public static List<Article> csdnBlog() {
        String url = "http://blog.csdn.net/iispring";
        String nextPageCssQuery = "html body div#container div#body div#main div.main div#papelist.pagelist a";
        String titleCssQuery = "html body div#container div#body div#main div.main div#article_list.list div.list_item.article_item div.article_title h1 span.link_title a";
        // NOTE(review): an empty label looks like lost non-ASCII characters; confirm against the live page.
        String nextPageText = "";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Crawls the news list at {@code http://www.oschina.net/news}.
     *
     * @return the articles found
     */
    public static List<Article> oschinaNews() {
        String url = "http://www.oschina.net/news";
        String nextPageCssQuery = "html body div#OSC_Screen div#OSC_Content.CenterDiv div#NewsChannel.Channel div#NewsList.ListPanel div#RecentNewsList.panel ul.pager li.page.next a";
        String titleCssQuery = "html body div#OSC_Screen div#OSC_Content.CenterDiv div#NewsChannel.Channel div#NewsList.ListPanel div#RecentNewsList.panel ul.List li h2 a";
        String nextPageText = ">";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Crawls the blog list at {@code http://my.oschina.net/apdplat/blog}.
     *
     * @return the articles found
     */
    public static List<Article> oschinaBlog() {
        String url = "http://my.oschina.net/apdplat/blog";
        String nextPageCssQuery = "html body div#OSC_Screen div#OSC_Content div.blog-user div.container div.flex-item div#search_list.flex-item-9.flex-item-md-9.content div#list.list.blog-list div.pages.sm-hide ul li a";
        String titleCssQuery = "html body div#OSC_Screen div#OSC_Content div.blog-user div.container div.flex-item div#search_list.flex-item-9.flex-item-md-9.content div#list.list.blog-list div.list-item div.layout div.layout-column div.title a";
        // NOTE(review): an empty label looks like lost non-ASCII characters; confirm against the live page.
        String nextPageText = "";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Crawls the Baidu result list for a search query.
     *
     * @param query the search terms; URL-encoded as UTF-8 before being appended to the request URL
     * @return the result entries found, or an empty list when {@code query} is
     *         {@code null}/blank or cannot be encoded
     */
    public static List<Article> baidu(String query) {
        // Validate the raw input: after encoding, a whitespace-only query becomes "+"
        // and would no longer be recognised as blank.
        if (StringUtils.isBlank(query)) {
            return Collections.emptyList();
        }
        String encodedQuery;
        try {
            encodedQuery = URLEncoder.encode(query, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            LOGGER.error("url", e);
            return Collections.emptyList();
        }
        String url = "http://www.baidu.com/s?wd=" + encodedQuery;
        String nextPageCssQuery = "html body div div div p#page a.n";
        String titleCssQuery = "html body div div div div div h3.t a";
        String nextPageText = ">";
        return run(url, nextPageCssQuery, nextPageText, titleCssQuery);
    }

    /**
     * Normalizes an iteye blog title for comparison with oschina titles by removing
     * every {@code "[]"} occurrence and trimming surrounding whitespace.
     *
     * @param title the raw iteye title
     * @return the normalized title
     */
    private static String normalizeIteyeTitle(String title) {
        return title.replace("[]", "").trim();
    }

    /**
     * Compares the oschina and iteye blogs by article title and logs, as HTML fragments,
     * the titles present on both sites, only on oschina, and only on iteye.
     *
     * <p>Iteye titles are normalized (see {@link #normalizeIteyeTitle(String)}) both for the
     * comparison and as keys of the iteye URL lookup, so that a link can always be
     * resolved for a title that was matched in its normalized form.
     */
    public static void blogCompare() {
        List<Article> ob = oschinaBlog();
        List<Article> ib = iteyeBlog();

        // title -> url lookups; the iteye map is keyed by the normalized title.
        Map<String, String> om = new HashMap<>();
        for (Article article : ob) {
            om.put(article.getTitle(), article.getUrl());
        }
        Map<String, String> im = new HashMap<>();
        for (Article article : ib) {
            im.put(normalizeIteyeTitle(article.getTitle()), article.getUrl());
        }

        List<String> iteyeBlog = ib.stream().map(b -> normalizeIteyeTitle(b.getTitle())).sorted()
                .collect(Collectors.toList());
        List<String> oschinaBlog = ob.stream().map(b -> b.getTitle()).sorted().collect(Collectors.toList());

        // Hash sets give constant-time membership tests for the filters below.
        Set<String> iteyeTitles = new HashSet<>(iteyeBlog);
        Set<String> oschinaTitles = new HashSet<>(oschinaBlog);

        // Titles published on both sites.
        List<String> commons = oschinaBlog.stream().filter(iteyeTitles::contains).collect(Collectors.toList());
        LOGGER.info("<h4>oschinaiteye(" + commons.size() + ")</h4>");
        int j = 0;
        for (String item : commons) {
            LOGGER.info((++j) + "?" + item + "    <a target=\"_blank\" href=\"" + om.get(item)
                    + "\">oschina</a>    <a target=\"_blank\" href=\"" + im.get(item) + "\">iteye</a><br/>");
        }

        // Titles published on oschina only.
        List<String> oschina = oschinaBlog.stream().filter(title -> !iteyeTitles.contains(title))
                .collect(Collectors.toList());
        LOGGER.info("<h4>oschina(" + oschina.size() + ")</h4>");
        int l = 0;
        for (String item : oschina) {
            LOGGER.info((++l) + "?<a target=\"_blank\" href=\"" + om.get(item)
                    + "\">" + item + "</a><br/>");
        }

        // Titles published on iteye only.
        List<String> iteye = iteyeBlog.stream().filter(title -> !oschinaTitles.contains(title))
                .collect(Collectors.toList());
        LOGGER.info("<h4>iteye(" + iteye.size() + ")</h4>");
        int k = 0;
        for (String item : iteye) {
            LOGGER.info((++k) + "?<a target=\"_blank\" href=\"" + im.get(item)
                    + "\">" + item + "</a><br/>");
        }
    }

    /**
     * Demo entry point: runs the oschina/iteye blog comparison.
     *
     * <p>The other crawlers in this class ({@code iteyeBlog()}, {@code iteyeNews()},
     * {@code iteyeMagazines()}, {@code csdnBlog()}, {@code oschinaNews()},
     * {@code oschinaBlog()}, {@code baidu(query)}) can be called here instead.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        blogCompare();
    }
}