// Java tutorial
/** * * APDPlat - Application Product Development Platform Copyright (c) 2013, ??, * yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.superword.extract; import org.apache.commons.lang.StringUtils; import org.apdplat.superword.model.SynonymAntonym; import org.apdplat.superword.model.Word; import org.apdplat.superword.tools.ProxyIp; import org.eclipse.jetty.util.ConcurrentHashSet; import org.jsoup.Connection; import org.jsoup.HttpStatusException; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentSkipListSet; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; /** * ???????? * @author ?? 
*/ public class ChineseSynonymAntonymExtractor { private ChineseSynonymAntonymExtractor() { } private static final Logger LOGGER = LoggerFactory.getLogger(ChineseSynonymAntonymExtractor.class); private static final String SYNONYM_ANTONYM_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.simple div#dict_content_3.dict_content div.industry_box div.industry.cn_synon_box"; private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; private static final String ENCODING = "gzip, deflate"; private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; private static final String CONNECTION = "keep-alive"; private static final String HOST = "www.iciba.com"; private static final String REFERER = "http://www.iciba.com/"; private static final List<String> USER_AGENTS = Arrays.asList( "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36 OPR"); private static final AtomicInteger uac = new AtomicInteger(); private static final Map<String, String> ANTONYM = new ConcurrentHashMap<>(); private static final ExecutorService EXECUTOR_SERVICE = Executors.newCachedThreadPool(); private static final Set<String> CHECKED_WORDS = new ConcurrentHashSet<>(); //????? 
private static final Map<Word, Set<Word>> SYNONYM_MAP = new ConcurrentHashMap<>(); private static final Path CHECKED_WORDS_PATH = Paths.get("src/main/resources/checked_words.txt"); private static final Path CHINESE_SYNONYM = Paths.get("src/main/resources/chinese_synonym.txt"); private static final Path CHINESE_ANTONYM = Paths.get("src/main/resources/chinese_antonym.txt"); public static SynonymAntonym parseSynonymAntonym(String html, String word) { SynonymAntonym synonymAntonym = new SynonymAntonym(); synonymAntonym.setWord(new Word(word, "")); try { for (Element element : Jsoup.parse(html).select(SYNONYM_ANTONYM_CSS_PATH)) { int size = element.children().size(); LOGGER.debug("element size:" + size); for (int i = 0; i < size / 2; i++) { String type = element.child(i * 2).text(); LOGGER.debug("type:" + type); if ("??".equals(type)) { String synonym = element.child(i * 2 + 1).text(); LOGGER.debug("synonym:" + synonym); for (String w : synonym.split("\\s+")) { w = w.replaceAll("\\s+", ""); if (w.length() < 2) { continue; } if (isNotChineseChar(w)) { LOGGER.debug("?" + w); continue; } if (w.equals(word)) { continue; } LOGGER.debug("word:" + w); synonymAntonym.addSynonym(new Word(w, "")); } } if ("???".equals(type)) { String antonym = element.child(i * 2 + 1).text(); LOGGER.debug("antonym:" + antonym); for (String w : antonym.split("\\s+")) { w = w.replaceAll("\\s+", ""); if (w.length() < 2) { continue; } if (isNotChineseChar(w)) { LOGGER.debug("?" + w); continue; } LOGGER.debug("word:" + w); synonymAntonym.addAntonym(new Word(w, "")); } } } } if (!synonymAntonym.getAntonym().isEmpty() || !synonymAntonym.getSynonym().isEmpty()) { LOGGER.info("??????" + synonymAntonym); } } catch (Exception e) { LOGGER.error("??????", e); } return synonymAntonym; } public static void parseSynonymAntonym(List<String> words) { LOGGER.info("??" 
+ words.size()); Set<String> SKIP_WORDS = new ConcurrentSkipListSet<>(); try { if (Files.notExists(CHECKED_WORDS_PATH)) { CHECKED_WORDS_PATH.toFile().createNewFile(); } SKIP_WORDS.addAll(Files.readAllLines(CHECKED_WORDS_PATH)); } catch (Exception e) { LOGGER.error("?", e); } int total = words.size() - SKIP_WORDS.size(); LOGGER.info("????" + SKIP_WORDS.size()); LOGGER.info("??" + total); String url = "http://www.iciba.com/"; AtomicInteger i = new AtomicInteger(); EXECUTOR_SERVICE.submit(() -> { while (true) { try { Thread.sleep(60000); } catch (InterruptedException e) { e.printStackTrace(); } save(); } }); words.parallelStream().forEach(word -> { if (SKIP_WORDS.contains(word)) { return; } LOGGER.info( "" + total + "/" + i.incrementAndGet() + " ?" + Thread.currentThread()); try { word = word.trim(); if ("".equals(word) || isNotChineseChar(word) || word.length() < 2) { return; } String html = getContent(url + word); int times = 1; while (StringUtils.isBlank(html) && times < 3) { times++; //IP? ProxyIp.toNewIp(); html = getContent(url + word); } if (StringUtils.isBlank(html)) { LOGGER.error("??" + url + word); return; } times = 1; //LOGGER.debug("?HTML" +html); while (html.contains("??ip?") && times < 3) { times++; //IP? 
ProxyIp.toNewIp(); html = getContent(url + word); } SynonymAntonym synonymAntonym = parseSynonymAntonym(html, word); if (!synonymAntonym.getSynonym().isEmpty()) { SYNONYM_MAP.put(synonymAntonym.getWord(), synonymAntonym.getSynonym()); } if (!synonymAntonym.getAntonym().isEmpty()) { StringBuilder str = new StringBuilder(); synonymAntonym.getAntonym().forEach(w -> str.append(w.getWord()).append(" ")); ANTONYM.put(word, str.toString().trim()); } CHECKED_WORDS.add(word); } catch (Exception e) { LOGGER.error("", e); } }); save(); filterSameRecord(CHINESE_SYNONYM); filterSameRecord(CHINESE_ANTONYM); } private static synchronized void save() { System.out.println("?"); List<String> SYNONYM_LIST = null; List<String> ANTONYM_LIST = null; try { if (Files.notExists(CHINESE_SYNONYM)) { CHINESE_SYNONYM.toFile().createNewFile(); } if (Files.notExists(CHINESE_ANTONYM)) { CHINESE_ANTONYM.toFile().createNewFile(); } System.out.println("??" + SYNONYM_MAP.size()); Set<String> SYNONYM_STR = new HashSet<>(); SYNONYM_MAP.keySet().forEach(k -> { StringBuilder str = new StringBuilder(); str.append(k.getWord()).append(" "); SYNONYM_MAP.get(k).stream().sorted().forEach(w -> { str.append(w.getWord()).append(" "); }); SYNONYM_STR.add(str.toString().trim()); }); List<String> existList = Files.readAllLines(CHINESE_SYNONYM); SYNONYM_STR.addAll(existList); SYNONYM_LIST = SYNONYM_STR.stream().sorted().collect(Collectors.toList()); System.out.println("??" + SYNONYM_LIST.size()); Files.write(CHINESE_SYNONYM, SYNONYM_LIST); Set<String> set = ANTONYM.keySet().stream().sorted().map(k -> k + " " + ANTONYM.get(k)) .collect(Collectors.toSet()); existList = Files.readAllLines(CHINESE_ANTONYM); set.addAll(existList); ANTONYM_LIST = set.stream().sorted().collect(Collectors.toList()); System.out.println("???" + ANTONYM_LIST.size()); Files.write(CHINESE_ANTONYM, ANTONYM_LIST); existList = Files.readAllLines(CHECKED_WORDS_PATH); CHECKED_WORDS.addAll(existList); System.out.println("?" 
+ CHECKED_WORDS.size()); Files.write(CHECKED_WORDS_PATH, CHECKED_WORDS); } catch (Exception e) { LOGGER.error("??", SYNONYM_LIST.toString()); LOGGER.error("???", ANTONYM_LIST.toString()); LOGGER.error("?", e); } } public static String getContent(String url) { LOGGER.debug("url:" + url); Connection conn = Jsoup.connect(url).header("Accept", ACCEPT).header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE).header("Connection", CONNECTION).header("Referer", REFERER) .header("Host", HOST) .header("User-Agent", USER_AGENTS.get(uac.incrementAndGet() % USER_AGENTS.size())) .header("X-Forwarded-For", getRandomIp()).header("Proxy-Client-IP", getRandomIp()) .header("WL-Proxy-Client-IP", getRandomIp()).ignoreContentType(true); String html = ""; try { html = conn.post().html(); } catch (Exception e) { if (e instanceof HttpStatusException) { HttpStatusException ex = (HttpStatusException) e; LOGGER.error("error code:" + ex.getStatusCode()); if (ex.getStatusCode() == 404) { return "404"; } } LOGGER.error("?URL" + url + " ?", e); } return html; } public static boolean isNotChineseChar(String str) { boolean temp = false; Pattern p = Pattern.compile("[^\u4e00-\u9fa5]"); Matcher m = p.matcher(str); if (m.find()) { temp = true; } return temp; } public static SynonymAntonym parseSynonymAntonym(String word) { try { return parseSynonymAntonym(Jsoup.parse(new URL("http://www.iciba.com/" + word), 15000).html(), word); } catch (Exception e) { LOGGER.error("??????", e); } return null; } public static String getRandomIp() { int first = new Random().nextInt(254) + 1; //A??0.0.0.0--10.255.255.255 while (first == 10) { first = new Random().nextInt(254) + 1; } int second = new Random().nextInt(254) + 1; //B??172.16.0.0--172.31.255.255 while (first == 172 && (second >= 16 && second <= 31)) { first = new Random().nextInt(254) + 1; second = new Random().nextInt(254) + 1; } //C??192.168.0.0--192.168.255.255 while (first == 192 && second == 168) { first = new Random().nextInt(254) + 1; 
second = new Random().nextInt(254) + 1; } int third = new Random().nextInt(254) + 1; int forth = new Random().nextInt(254) + 1; return first + "." + second + "." + second + "." + forth; } /** * ?? * ?? * ?? * ??? * @param path */ private static void filterSameRecord(Path path) { try { AtomicInteger i = new AtomicInteger(); Set<String> set = new HashSet<>(); List<String> list = Files.readAllLines(path).stream().filter(line -> { String[] attr = line.split("\\s+"); String words = Arrays.asList(attr).stream().sorted().collect(Collectors.toList()).toString(); if (set.contains(words)) { i.incrementAndGet(); LOGGER.info("??" + line); return false; } set.add(words); return true; }).sorted().collect(Collectors.toList()); Files.write(path, list); LOGGER.info("??" + i.get()); } catch (Exception e) { LOGGER.error("??", e); } } public static void main(String[] args) throws Exception { //parseSynonymAntonym(""); //parseSynonymAntonym("???"); //parseSynonymAntonym(Arrays.asList("", "???")); //System.out.println(getContent("http://www.iciba.com/%E7%83%AD%E7%88%B1")); parseSynonymAntonym(Files.readAllLines(Paths.get("src/main/resources/dic.txt")).stream() .sorted((a, b) -> new Integer(a.length()).compareTo(b.length())).collect(Collectors.toList())); } }