org.seo.rank.impl.BaiduCopyChecker.java Source code

Java tutorial

Introduction

Here is the source code for org.seo.rank.impl.BaiduCopyChecker.java

Source

/*
 * *
 *  *
 *  * APDPlat - Application Product Development Platform
 *  * Copyright (c) 2013, ??, yang-shangchuan@qq.com
 *  *
 *  * This program is free software: you can redistribute it and/or modify
 *  * it under the terms of the GNU General Public License as published by
 *  * the Free Software Foundation, either version 3 of the License, or
 *  * (at your option) any later version.
 *  *
 *  * This program is distributed in the hope that it will be useful,
 *  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  * GNU General Public License for more details.
 *  *
 *  * You should have received a copy of the GNU General Public License
 *  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *  *
 *
 */

package org.seo.rank.impl;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.seo.rank.CopyChecker;
import org.seo.rank.tools.DynamicIp;
import org.seo.rank.list.UrlTools;
import org.seo.rank.list.impl.DefaultParser;
import org.seo.rank.model.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**
 * 
 * @author ??
 */
public class BaiduCopyChecker implements CopyChecker {
    private static final Logger LOGGER = LoggerFactory.getLogger(BaiduCopyChecker.class);
    private static final String ACCEPT = "text/html, */*; q=0.01";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    private static final String HOST = "www.baidu.com";
    private static final String REFERER = "http://www.baidu.com";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0";

    //?
    private static final int PAGE = 15;
    private static final int PAGESIZE = 10;

    @Override
    public Map<Article, Set<String>> check(List<Article> articles) {
        Map<Article, Set<String>> data = new HashMap<>();
        articles.forEach(article -> {
            data.put(article, doCheck(article));
        });
        return data;
    }

    public Set<String> doCheck(Article article) {
        Set<String> data = new HashSet<>();
        if (StringUtils.isBlank(article.getTitle()) || StringUtils.isBlank(article.getUrl())) {
            return data;
        }
        String query = null;
        try {
            query = URLEncoder.encode(article.getTitle(), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            LOGGER.error("url", e);
            return data;
        }
        if (StringUtils.isBlank(query)) {
            return data;
        }
        for (int i = 0; i < PAGE; i++) {
            String url = "http://www.baidu.com/s?tn=monline_5_dg&ie=utf-8&wd=" + query + "&oq=" + query
                    + "&usm=3&f=8&bs=" + query + "&rsv_bp=1&rsv_sug3=1&rsv_sug4=141&rsv_sug1=1&rsv_sug=1&pn="
                    + i * PAGESIZE;
            LOGGER.debug(url);
            data.addAll(doCheck(url, article));
        }
        return data;
    }

    private Set<String> doCheck(String url, Article article) {
        Set<String> data = new HashSet<>();
        try {
            Document document = Jsoup.connect(url).header("Accept", ACCEPT).header("Accept-Encoding", ENCODING)
                    .header("Accept-Language", LANGUAGE).header("Connection", CONNECTION).header("Host", HOST)
                    .header("Referer", REFERER).header("User-Agent", USER_AGENT).get();
            String titleCssQuery = "html body div div div div div h3.t a";
            Elements elements = document.select(titleCssQuery);
            int i = 0;
            for (Element element : elements) {
                String _title = element.text();
                if (StringUtils.isBlank(_title)) {
                    continue;
                }
                i++;
                LOGGER.debug(i + ":" + _title);
                if (_title.contains("") || !contains(_title, article.getTitle())) {
                    LOGGER.debug("?");
                    continue;
                }
                String href = element.attr("href");
                href = UrlTools.normalizeUrl(url, href);
                String realUrl = urlConvert(href);
                LOGGER.debug("url:" + url);
                LOGGER.debug("realUrl:" + realUrl);
                String[] target = new URL(realUrl).getHost().split("\\.");
                String[] source = new URL(article.getUrl()).getHost().split("\\.");
                if (target.length > 1 && source.length > 1
                        && !(target[target.length - 2] + target[target.length - 1])
                                .equals(source[source.length - 2] + source[source.length - 1])) {
                    data.add(realUrl);
                }
            }
        } catch (Exception ex) {
            LOGGER.error("?", ex);
        }
        return data;
    }

    /**
     * title2??title1
     * @param title2
     * @param title1
     * @return
     */
    private static boolean contains(String title2, String title1) {
        StringBuilder str2 = new StringBuilder();
        StringBuilder str1 = new StringBuilder();
        for (char c : title2.toCharArray()) {
            if (Character.isLetter(c)) {
                str2.append(c);
            }
        }
        for (char c : title1.toCharArray()) {
            if (Character.isLetter(c)) {
                str1.append(c);
            }
        }
        LOGGER.debug("??" + title2);
        LOGGER.debug("??" + str2.toString());
        LOGGER.debug("??" + title1);
        LOGGER.debug("??" + str1.toString());
        if (str2.toString().contains(str1.toString())) {
            LOGGER.debug(title2 + " ?? " + title1);
            return true;
        }
        LOGGER.debug(title2 + " ??? " + title1);
        return false;
    }

    /**
     * ?
     * @param url 
     * @return 
     */
    private static String urlConvert(String url) {
        try {
            if (!url.startsWith("http://www.baidu.com/link?url=")) {
                //???URL
                return url;
            }
            LOGGER.debug("??URL" + url);
            Connection.Response response = getResponse(url);
            //???
            if (response == null || response.body().contains("??")
                    || response.body().contains("??")) {
                //IP?
                DynamicIp.toNewIp();
                response = getResponse(url);
            }
            String realUrl = response.header("Location");
            LOGGER.debug("??URL" + realUrl);
            //???
            //
            //????ITEYE???
            /*
            LOGGER.debug("???"+realUrl);
            Connection.Response response = getResponse(realUrl);
            //???
            if(response==null || response.body().contains("??")
                          || response.body().contains("??")){
            //IP?
            DynamicIp.toNewIp();
            response = getResponse(realUrl);
            }
            String realUrl2 = response.header("Location");
            if(!StringUtils.isBlank(realUrl2)){
            LOGGER.debug("??"+realUrl2);
            return realUrl2;
            }
            */
            return realUrl;
        } catch (Exception e) {
            LOGGER.error("URL?", e);
        }
        return url;
    }

    private static Connection.Response getResponse(String url) {
        try {
            Connection.Response response = Jsoup.connect(url).header("Accept", ACCEPT)
                    .header("Accept-Encoding", ENCODING).header("Accept-Language", LANGUAGE)
                    .header("Connection", CONNECTION).header("Host", HOST).header("Referer", REFERER)
                    .header("User-Agent", USER_AGENT).ignoreContentType(true).timeout(30000).followRedirects(false)
                    .execute();
            return response;
        } catch (Exception e) {
            LOGGER.debug("??", e);
        }
        return null;
    }

    public static void main(String[] args) {
        CopyChecker copyChecker = new BaiduCopyChecker();
        //OSCHINA?
        //List<Article> articles = DefaultParser.oschinaBlog();
        //ITEYE?
        List<Article> articles = DefaultParser.iteyeBlog();
        //??
        articles = articles.stream()
                .filter(article -> !(article.getTitle().contains("idioms")
                        || article.getTitle().contains("?Tachyon")
                        || article.getTitle().contains("Nutch")
                        || article.getTitle().contains("BUG")
                        || article.getTitle().contains("?")
                        || article.getTitle().contains("?")
                        || article.getTitle().contains("??")
                        || article.getTitle().contains("The Future of Compass & ElasticSearch")
                        || article.getTitle().contains("1208???")
                        || article.getTitle().contains("Java")
                        || article.getTitle().contains("What a Wonderful Code")
                        || article.getTitle().contains("?")
                        || article.getTitle().contains("Linux Netcat command  The swiss army knife of net")
                        || article.getTitle().contains("common prefix different suffix")))
                .collect(Collectors.toList());
        //
        Map<Article, Set<String>> result = copyChecker.check(articles);
        //
        LOGGER.info("<h4>?" + articles.size() + "</h4>");
        AtomicInteger i = new AtomicInteger();
        result.entrySet().stream().sorted((a, b) -> b.getValue().size() - a.getValue().size()).forEach(e -> {
            String query = null;
            try {
                query = URLEncoder.encode(e.getKey().getTitle(), "UTF-8");
            } catch (UnsupportedEncodingException ex) {
                LOGGER.error("url", ex);
                return;
            }
            String originURL = e.getKey().getUrl();
            if (e.getValue().size() > 0) {
                LOGGER.info("<h4>" + i.incrementAndGet()
                        + "?<a target=\"_blank\" href=\"http://www.baidu.com/s?wd=" + query + "\">"
                        + e.getKey().getTitle() + "</a>  (" + e.getValue().size() + ")</h4>");
                LOGGER.info("<a target=\"_blank\" href=\"" + originURL + "\">" + originURL
                        + "</a><br/>");
                LOGGER.info("");
                LOGGER.info("<ol>");
                e.getValue().stream().sorted().forEach(
                        url -> LOGGER.info("<li><a target=\"_blank\" href=\"" + url + "\">" + url + "</a></li>"));
                LOGGER.info("</ol>");
            } else {
                LOGGER.info(i.incrementAndGet() + "?<a target=\"_blank\" href=\"http://www.baidu.com/s?wd="
                        + query + "\">" + e.getKey().getTitle() + "</a><br/>");
                LOGGER.info("<a target=\"_blank\" href=\"" + originURL + "\">" + originURL
                        + "</a>    <br/>");
            }
        });
    }
}