// Java tutorial
/** * * APDPlat - Application Product Development Platform * Copyright (c) 2013, ??, yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.seo.rank.impl; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.seo.rank.Ranker; import org.seo.rank.tools.DynamicIp; import org.seo.rank.list.UrlTools; import org.seo.rank.list.impl.DefaultParser; import org.seo.rank.model.Article; import org.seo.rank.model.Rank; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * ?????? * @author ?? 
*/ public class BaiduRanker implements Ranker { private static final Logger LOGGER = LoggerFactory.getLogger(BaiduRanker.class); private static final String ACCEPT = "text/html, */*; q=0.01"; private static final String ENCODING = "gzip, deflate"; private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; private static final String CONNECTION = "keep-alive"; private static final String HOST = "www.baidu.com"; private static final String REFERER = "http://www.baidu.com"; private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0"; //? private static final int PAGE = 15; private static final int PAGESIZE = 10; @Override public void rank(List<Rank> ranks) { for (Rank rank : ranks) { rank(rank); } } @Override public void rank(Rank rank) { doRank(rank); } /** * ?? * @param rank ??? */ public void doRank(Rank rank) { if (StringUtils.isBlank(rank.getKeyword()) || StringUtils.isBlank(rank.getUrl())) { return; } //? searchBaiduIndex(rank); if (!rank.isIndex()) { return; } //?? String query = null; try { query = URLEncoder.encode(rank.getKeyword(), "UTF-8"); } catch (UnsupportedEncodingException e) { LOGGER.error("url", e); return; } if (StringUtils.isBlank(query)) { return; } for (int i = 0; i < PAGE; i++) { String path = "http://www.baidu.com/s?tn=monline_5_dg&ie=utf-8&wd=" + query + "&oq=" + query + "&usm=3&f=8&bs=" + query + "&rsv_bp=1&rsv_sug3=1&rsv_sug4=141&rsv_sug1=1&rsv_sug=1&pn=" + i * PAGESIZE; LOGGER.debug(path); int r = searchBaiduRank(path, rank); if (r > 0) { rank.setRank(r + i * 10); //?? return; } } } /** * ? 
* @param rank */ private void searchBaiduIndex(Rank rank) { String url = "url:" + rank.getUrl(); url = "http://www.baidu.com/s?wd=" + url; LOGGER.debug(url); try { Document document = Jsoup.connect(url).header("Accept", ACCEPT).header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE).header("Connection", CONNECTION) .header("User-Agent", USER_AGENT).header("Host", HOST).get(); String notFoundCssQuery = "html body div div div div div p"; Elements elements = document.select(notFoundCssQuery); for (Element element : elements) { String text = element.text(); if (text.contains("") && text.contains("")) { // LOGGER.debug(""); rank.setIndex(false); return; } } String numberCssQuery = "html body div div div div.nums"; elements = document.select(numberCssQuery); for (Element element : elements) { String text = element.text(); if (text.equals("1")) { // LOGGER.debug(""); rank.setIndex(true); return; } } } catch (IOException ex) { LOGGER.error("?", ex); } LOGGER.debug(""); } /** * ?? * @param url URL * @param rank ?? 
* @return */ private int searchBaiduRank(String url, Rank rank) { String targetUrl = rank.getUrl(); try { Document document = Jsoup.connect(url).header("Accept", ACCEPT).header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE).header("Connection", CONNECTION).header("Host", HOST) .header("Referer", REFERER).header("User-Agent", USER_AGENT).get(); String titleCssQuery = "html body div div div div div h3.t a"; Elements elements = document.select(titleCssQuery); int i = 0; for (Element element : elements) { String title = element.text(); if (StringUtils.isBlank(title)) { continue; } i++; LOGGER.debug(i + ":" + title); if (!title.contains(rank.getKeyword())) { LOGGER.debug("???"); continue; } String href = element.attr("href"); href = UrlTools.normalizeUrl(url, href); String realUrl = urlConvert(href); LOGGER.debug("url:" + url); LOGGER.debug("realUrl:" + realUrl); LOGGER.debug("targetUrl:" + targetUrl); if (targetUrl.equals(realUrl)) { return i; } } } catch (Exception ex) { LOGGER.error("?", ex); } return -1; } /** * ? * @param url * @return */ private static String urlConvert(String url) { try { if (!url.startsWith("http://www.baidu.com/link?url=")) { //???URL return url; } LOGGER.debug("??URL" + url); Connection.Response response = getResponse(url); //??? if (response == null || response.body().contains("??") || response.body().contains("??")) { //IP? DynamicIp.toNewIp(); response = getResponse(url); } String realUrl = response.header("Location"); LOGGER.debug("??URL" + realUrl); //??? // //????ITEYE??? /* LOGGER.debug("???"+realUrl); Connection.Response response = getResponse(realUrl); //??? if(response==null || response.body().contains("??") || response.body().contains("??")){ //IP? 
DynamicIp.toNewIp(); response = getResponse(realUrl); } String realUrl2 = response.header("Location"); if(!StringUtils.isBlank(realUrl2)){ LOGGER.debug("??"+realUrl2); return realUrl2; } */ return realUrl; } catch (Exception e) { LOGGER.error("URL?", e); } return url; } private static Connection.Response getResponse(String url) { try { Connection.Response response = Jsoup.connect(url).header("Accept", ACCEPT) .header("Accept-Encoding", ENCODING).header("Accept-Language", LANGUAGE) .header("Connection", CONNECTION).header("Host", HOST).header("Referer", REFERER) .header("User-Agent", USER_AGENT).ignoreContentType(true).timeout(30000).followRedirects(false) .execute(); return response; } catch (Exception e) { LOGGER.debug("??", e); } return null; } public static void main(String[] args) { BaiduRanker ranker = new BaiduRanker(); /* Rank rank = new Rank(); rank.setKeyword("Java???APDPlat??"); rank.setUrl("http://www.iteye.com/magazines/113"); ranker.searchBaiduIndex(rank); System.out.println(rank); rank = new Rank(); rank.setKeyword("Java???APDPlat??"); rank.setUrl("http://www.iteye.com/magazines/113"); ranker.rank(rank); System.out.println(rank); rank = new Rank(); rank.setKeyword("QuestionAnsweringSystem v1.1 ?"); rank.setUrl("http://yangshangchuan.iteye.com/blog/2101533"); ranker.searchBaiduIndex(rank); System.out.println(rank); rank = new Rank(); rank.setKeyword("?"); rank.setUrl("http://www.manmankan.com/dy2013/zongyi/201306/6.shtml"); ranker.rank(rank); System.out.println(rank); */ //OSCHINA??? //List<Article> articles = DefaultParser.oschinaBlog(); //ITEYE??? List<Article> articles = DefaultParser.iteyeBlog(); //???? List<Rank> ranks = new ArrayList<>(); articles.forEach(blog -> { Rank rank = new Rank(); rank.setKeyword(blog.getTitle()); rank.setUrl(blog.getUrl()); ranks.add(rank); }); //???? ranker.rank(ranks); //??? Map<String, Integer> map = new HashMap<>(); ranks.forEach(rank -> map.put(rank.getKeyword(), rank.getRank())); LOGGER.info("???" 
+ ranks.size()); LOGGER.info("<ol>"); map.entrySet().stream().sorted((a, b) -> a.getValue() - b.getValue()).forEach(e -> { String query = null; try { query = URLEncoder.encode(e.getKey(), "UTF-8"); } catch (UnsupportedEncodingException ex) { LOGGER.error("url", ex); return; } LOGGER.info("<li><a target=\"_blank\" href=\"http://www.baidu.com/s?wd=" + query + "\">" + e.getKey() + "(" + e.getValue() + ")</a></li>"); }); LOGGER.info("</ol>"); } }