org.seo.rank.impl.BaiduRanker.java Source code

Java tutorial

Introduction

Here is the source code for org.seo.rank.impl.BaiduRanker.java

Source

/**
 * 
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, Yang Shangchuan, yang-shangchuan@qq.com
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package org.seo.rank.impl;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.seo.rank.Ranker;
import org.seo.rank.tools.DynamicIp;
import org.seo.rank.list.UrlTools;
import org.seo.rank.list.impl.DefaultParser;
import org.seo.rank.model.Article;
import org.seo.rank.model.Rank;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link Ranker} implementation that looks up where a URL appears in Baidu's
 * search results for a given keyword.
 *
 * <p>For each {@link Rank} it first checks whether Baidu has indexed the URL
 * (a {@code url:} query) and, if so, scans up to {@link #PAGE} result pages
 * for a result whose title contains the keyword and whose resolved link equals
 * the target URL.
 *
 * <p>NOTE(review): several string literals in this class appear to have lost
 * their non-ASCII characters (they show up as {@code ""} or {@code "?"}). They
 * are reproduced unchanged here and flagged where they affect control flow;
 * restore them from the upstream source before relying on this class.
 *
 * @author Yang Shangchuan
 */
public class BaiduRanker implements Ranker {
    private static final Logger LOGGER = LoggerFactory.getLogger(BaiduRanker.class);
    private static final String ACCEPT = "text/html, */*; q=0.01";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    private static final String HOST = "www.baidu.com";
    private static final String REFERER = "http://www.baidu.com";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0";

    /** Maximum number of result pages scanned per keyword. */
    private static final int PAGE = 15;
    /** Step used for the {@code pn} result offset between consecutive pages. */
    private static final int PAGESIZE = 10;

    /**
     * Ranks every entry of the list in order; a {@code null} list is ignored.
     *
     * @param ranks the entries to rank; each one is updated in place
     */
    @Override
    public void rank(List<Rank> ranks) {
        if (ranks == null) {
            return;
        }
        for (Rank rank : ranks) {
            rank(rank);
        }
    }

    /**
     * Ranks a single entry by delegating to {@link #doRank(Rank)}.
     *
     * @param rank the entry to rank; updated in place
     */
    @Override
    public void rank(Rank rank) {
        doRank(rank);
    }

    /**
     * Determines the position of {@code rank.getUrl()} in Baidu's results for
     * {@code rank.getKeyword()} and stores it via {@code rank.setRank(..)}.
     *
     * <p>Nothing is done when the entry is {@code null}, when keyword or URL is
     * blank, or when the index check leaves {@code rank.isIndex()} false. If
     * the URL is not found within {@link #PAGE} pages the rank is left untouched.
     *
     * @param rank the entry to rank; updated in place
     */
    public void doRank(Rank rank) {
        if (rank == null || StringUtils.isBlank(rank.getKeyword()) || StringUtils.isBlank(rank.getUrl())) {
            return;
        }
        // Only search for a position when Baidu reports the URL as indexed.
        searchBaiduIndex(rank);
        if (!rank.isIndex()) {
            return;
        }
        String query = encode(rank.getKeyword());
        if (StringUtils.isBlank(query)) {
            return;
        }
        for (int page = 0; page < PAGE; page++) {
            int offset = page * PAGESIZE;
            String path = "http://www.baidu.com/s?tn=monline_5_dg&ie=utf-8&wd=" + query + "&oq=" + query
                    + "&usm=3&f=8&bs=" + query + "&rsv_bp=1&rsv_sug3=1&rsv_sug4=141&rsv_sug1=1&rsv_sug=1&pn="
                    + offset;
            LOGGER.debug(path);
            int positionOnPage = searchBaiduRank(path, rank);
            if (positionOnPage > 0) {
                // Overall position = position within the page + results skipped so far.
                rank.setRank(positionOnPage + offset);
                return;
            }
        }
    }

    /**
     * Queries Baidu with {@code url:<target>} and sets {@code rank.setIndex(..)}
     * according to the result page. When neither marker is recognised, or the
     * request fails, the index flag is left as it was.
     *
     * @param rank the entry whose URL is checked; its index flag may be updated
     */
    private void searchBaiduIndex(Rank rank) {
        // The target URL is itself a query-parameter value, so it must be encoded;
        // otherwise '&', '#', '?' etc. inside it would corrupt the search request.
        String query = encode("url:" + rank.getUrl());
        if (query == null) {
            return;
        }
        String url = "http://www.baidu.com/s?wd=" + query;
        LOGGER.debug(url);
        try {
            Document document = baiduConnection(url).get();

            Elements paragraphs = document.select("html body div div div div div p");
            for (Element paragraph : paragraphs) {
                String text = paragraph.text();
                // NOTE(review): both literals are empty, so this condition is always true
                // and any matching <p> marks the URL as not indexed. Presumably they held
                // fragments of Baidu's "no results" message - restore from upstream.
                if (text.contains("") && text.contains("")) {
                    LOGGER.debug("");
                    rank.setIndex(false);
                    return;
                }
            }
            Elements counters = document.select("html body div div div div.nums");
            for (Element counter : counters) {
                String text = counter.text();
                // NOTE(review): literal looks truncated; presumably it was the full
                // "about 1 result" banner text - confirm against upstream.
                if (text.equals("1")) {
                    LOGGER.debug("");
                    rank.setIndex(true);
                    return;
                }
            }
        } catch (IOException ex) {
            LOGGER.error("?", ex);
        }
        LOGGER.debug("");
    }

    /**
     * Fetches one result page and looks for the target URL among its result titles.
     *
     * <p>Only results with a non-blank title are counted. A result is a match when
     * its title contains the keyword and its link, after {@link #urlConvert(String)},
     * equals {@code rank.getUrl()}.
     *
     * @param url  the result-page URL to fetch
     * @param rank the entry supplying the keyword and target URL
     * @return the 1-based position of the match among the counted results on this
     *         page, or {@code -1} when there is no match or the page could not be processed
     */
    private int searchBaiduRank(String url, Rank rank) {
        String targetUrl = rank.getUrl();
        try {
            Document document = baiduConnection(url).header("Referer", REFERER).get();
            Elements links = document.select("html body div div div div div h3.t a");
            int position = 0;
            for (Element link : links) {
                String title = link.text();
                if (StringUtils.isBlank(title)) {
                    continue;
                }
                position++;
                LOGGER.debug("{}:{}", position, title);
                if (!title.contains(rank.getKeyword())) {
                    LOGGER.debug("???");
                    continue;
                }
                String href = UrlTools.normalizeUrl(url, link.attr("href"));
                String realUrl = urlConvert(href);
                LOGGER.debug("url:{}", url);
                LOGGER.debug("realUrl:{}", realUrl);
                LOGGER.debug("targetUrl:{}", targetUrl);
                if (targetUrl.equals(realUrl)) {
                    return position;
                }
            }
        } catch (Exception ex) {
            // Best effort: a failure on one page is logged and reported as "not found".
            LOGGER.error("?", ex);
        }
        return -1;
    }

    /**
     * Resolves a Baidu redirect link ({@code http://www.baidu.com/link?url=...}) to
     * the value of its {@code Location} response header.
     *
     * <p>URLs that are not Baidu redirect links are returned unchanged. If the first
     * response is missing or its body matches one of the block markers, the IP is
     * switched via {@code DynamicIp.toNewIp()} and the request is retried once.
     *
     * @param url the link taken from a search result; may be {@code null}
     * @return the {@code Location} header of the redirect (possibly {@code null} if the
     *         response carries none), or {@code url} itself when it is not a redirect
     *         link or could not be resolved
     */
    private static String urlConvert(String url) {
        if (url == null || !url.startsWith("http://www.baidu.com/link?url=")) {
            return url;
        }
        try {
            LOGGER.debug("??URL{}", url);
            Connection.Response response = getResponse(url);
            // NOTE(review): the two marker literals are garbled and now identical;
            // presumably they were two distinct "access blocked" phrases - confirm.
            if (response == null || response.body().contains("??")
                    || response.body().contains("??")) {
                DynamicIp.toNewIp();
                response = getResponse(url);
            }
            if (response == null) {
                // The retry failed as well; fall back to the unresolved link.
                LOGGER.error("No response for redirect link after retry: {}", url);
                return url;
            }
            String realUrl = response.header("Location");
            LOGGER.debug("??URL{}", realUrl);
            // Only the first redirect hop is resolved. A second hop (following the
            // Location of realUrl) was attempted in earlier code and is disabled.
            return realUrl;
        } catch (Exception e) {
            LOGGER.error("URL?", e);
        }
        return url;
    }

    /**
     * Executes a GET against {@code url} without following redirects, so that the
     * caller can read the {@code Location} header itself.
     *
     * @param url the URL to request
     * @return the response, or {@code null} when the request throws
     */
    private static Connection.Response getResponse(String url) {
        try {
            return baiduConnection(url)
                    .header("Referer", REFERER)
                    .ignoreContentType(true)
                    .timeout(30000)
                    .followRedirects(false)
                    .execute();
        } catch (Exception e) {
            LOGGER.debug("??", e);
        }
        return null;
    }

    /**
     * Creates a jsoup connection carrying the browser-like headers shared by every
     * request this class sends.
     *
     * @param url the URL to connect to
     * @return a connection with the common headers set; not yet executed
     */
    private static Connection baiduConnection(String url) {
        return Jsoup.connect(url)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Host", HOST)
                .header("User-Agent", USER_AGENT);
    }

    /**
     * URL-encodes {@code text} as UTF-8 for use as a query-parameter value.
     *
     * @param text the raw value
     * @return the encoded value, or {@code null} (after logging) if encoding fails
     */
    private static String encode(String text) {
        try {
            return URLEncoder.encode(text, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            LOGGER.error("url", e);
            return null;
        }
    }

    /**
     * Ranks the articles returned by {@code DefaultParser.iteyeBlog()} and logs the
     * result as an HTML ordered list sorted by ascending rank.
     * ({@code DefaultParser.oschinaBlog()} can be substituted as the article source.)
     *
     * @param args unused
     */
    public static void main(String[] args) {
        BaiduRanker ranker = new BaiduRanker();
        List<Article> articles = DefaultParser.iteyeBlog();

        // One Rank per article: the title is the keyword, the article URL the target.
        List<Rank> ranks = new ArrayList<>();
        for (Article blog : articles) {
            Rank rank = new Rank();
            rank.setKeyword(blog.getTitle());
            rank.setUrl(blog.getUrl());
            ranks.add(rank);
        }
        ranker.rank(ranks);

        // Keyed by keyword, so articles sharing a title collapse into one entry.
        Map<String, Integer> map = new HashMap<>();
        for (Rank rank : ranks) {
            map.put(rank.getKeyword(), rank.getRank());
        }
        LOGGER.info("???{}", ranks.size());
        LOGGER.info("<ol>");
        // comparingByValue avoids the overflow risk of a subtraction-based comparator.
        map.entrySet().stream().sorted(Map.Entry.comparingByValue()).forEach(e -> {
            String query = encode(e.getKey());
            if (query == null) {
                return;
            }
            LOGGER.info("<li><a target=\"_blank\" href=\"http://www.baidu.com/s?wd={}\">{}({})</a></li>",
                    query, e.getKey(), e.getValue());
        });
        LOGGER.info("</ol>");
    }
}