Java tutorial
/** * * APDPlat - Application Product Development Platform Copyright (c) 2013, ??, * yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.superword.tools; import org.apache.commons.lang.StringUtils; import org.apdplat.superword.model.Word; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; /** * ? * @author ?? */ public class WordsFetcher { private WordsFetcher() { } private static final Logger LOGGER = LoggerFactory.getLogger(WordsFetcher.class); private static final String WORD_CSS_PATH = "html body div#main_block div.word_box form#word_form div.word_main ul li div.word_main_list_w span"; private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; private static final String ENCODING = "gzip, deflate"; private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; private static final String CONNECTION = "keep-alive"; private static final String HOST = "www.iciba.com"; private static final String REFERER = "http://www.iciba.com/"; private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0"; /** * ? */ public static void updatePrimarySchool() { //? update(63, 5, "/word_primary_school.txt"); update(64, 3, "/word_primary_school.txt"); update(65, 3, "/word_primary_school.txt"); update(66, 3, "/word_primary_school.txt"); update(67, 4, "/word_primary_school.txt"); update(68, 4, "/word_primary_school.txt"); update(69, 4, "/word_primary_school.txt"); update(70, 5, "/word_primary_school.txt"); update(71, 5, "/word_primary_school.txt"); update(72, 4, "/word_primary_school.txt"); update(73, 14, "/word_primary_school.txt"); update(74, 10, "/word_primary_school.txt"); //? update(655, 8, "/word_primary_school.txt"); update(656, 8, "/word_primary_school.txt"); update(657, 2, "/word_primary_school.txt"); update(658, 3, "/word_primary_school.txt"); update(149, 3, "/word_primary_school.txt"); update(150, 4, "/word_primary_school.txt"); update(151, 3, "/word_primary_school.txt"); update(152, 4, "/word_primary_school.txt"); update(154, 6, "/word_primary_school.txt"); update(155, 8, "/word_primary_school.txt"); update(156, 8, "/word_primary_school.txt"); //? update(265, 2, "/word_primary_school.txt"); update(266, 3, "/word_primary_school.txt"); update(267, 1, "/word_primary_school.txt"); update(268, 2, "/word_primary_school.txt"); update(269, 1, "/word_primary_school.txt"); update(271, 2, "/word_primary_school.txt"); update(272, 3, "/word_primary_school.txt"); } /** * ? */ public static void updateJuniorSchool() { //? update(57, 27, "/word_junior_school.txt"); update(58, 24, "/word_junior_school.txt"); update(59, 21, "/word_junior_school.txt"); update(60, 15, "/word_junior_school.txt"); update(61, 20, "/word_junior_school.txt"); update(62, 16, "/word_junior_school.txt"); //? update(105, 30, "/word_junior_school.txt"); update(106, 20, "/word_junior_school.txt"); update(107, 28, "/word_junior_school.txt"); update(108, 25, "/word_junior_school.txt"); update(109, 37, "/word_junior_school.txt"); //? update(221, 19, "/word_junior_school.txt"); update(222, 19, "/word_junior_school.txt"); update(223, 18, "/word_junior_school.txt"); update(224, 17, "/word_junior_school.txt"); update(225, 12, "/word_junior_school.txt"); update(226, 8, "/word_junior_school.txt"); //? update(273, 20, "/word_junior_school.txt"); update(224, 18, "/word_junior_school.txt"); update(226, 16, "/word_junior_school.txt"); update(227, 16, "/word_junior_school.txt"); update(228, 12, "/word_junior_school.txt"); update(229, 14, "/word_junior_school.txt"); //? update(728, 18, "/word_junior_school.txt"); update(729, 25, "/word_junior_school.txt"); // update(678, 17, "/word_junior_school.txt"); } /** * */ public static void updateSeniorSchool() { // update(51, 19, "/word_senior_school.txt"); update(52, 25, "/word_senior_school.txt"); update(53, 24, "/word_senior_school.txt"); update(54, 20, "/word_senior_school.txt"); update(55, 25, "/word_senior_school.txt"); update(56, 23, "/word_senior_school.txt"); // update(110, 14, "/word_senior_school.txt"); update(111, 14, "/word_senior_school.txt"); update(112, 19, "/word_senior_school.txt"); update(113, 15, "/word_senior_school.txt"); update(114, 18, "/word_senior_school.txt"); update(118, 20, "/word_senior_school.txt"); update(119, 19, "/word_senior_school.txt"); // update(139, 5, "/word_senior_school.txt"); update(140, 194, "/word_senior_school.txt"); } /** * */ public static void updateUniversity() { // update(45, 27, "/word_university.txt"); update(46, 37, "/word_university.txt"); update(47, 40, "/word_university.txt"); update(48, 46, "/word_university.txt"); update(49, 25, "/word_university.txt"); update(50, 65, "/word_university.txt"); } /** * */ public static void updateNewConception() { update(41, 41, "/word_new_conception.txt"); update(42, 49, "/word_new_conception.txt"); update(43, 81, "/word_new_conception.txt"); update(44, 76, "/word_new_conception.txt"); } public static void updateCET4() { update(11, 226, "/word_CET4.txt"); update(122, 35, "/word_CET4.txt"); } public static void updateCET6() { update(12, 105, "/word_CET6.txt"); update(123, 25, "/word_CET6.txt"); } public static void updateKY() { update(13, 274, "/word_KY.txt"); update(143, 3, "/word_KY.txt"); } public static void updateTOEFL() { update(14, 245, "/word_TOEFL.txt"); } public static void updateIELTS() { update(15, 228, "/word_IELTS.txt"); } public static void updateGRE() { update(16, 375, "/word_GRE.txt"); } public static void updateGMAT() { update(36, 40, "/word_GMAT.txt"); update(37, 54, "/word_GMAT.txt"); update(38, 108, "/word_GMAT.txt"); } public static void updateTOEIC() { update(682, 42, "/word_TOEIC.txt"); } public static void updateSAT() { update(121, 11, "/word_SAT.txt"); } public static void updateBEC() { update(680, 47, "/word_BEC.txt"); update(681, 10, "/word_BEC.txt"); } public static void updateADULT() { update(703, 144, "/word_ADULT.txt"); update(704, 284, "/word_ADULT.txt"); update(705, 143, "/word_ADULT.txt"); update(706, 11, "/word_ADULT.txt"); update(707, 198, "/word_ADULT.txt"); update(708, 171, "/word_ADULT.txt"); update(709, 89, "/word_ADULT.txt"); update(710, 61, "/word_ADULT.txt"); update(711, 180, "/word_ADULT.txt"); } public static void updateMBA() { update(39, 243, "/word_MBA.txt"); } public static void updateTEM4() { update(90, 105, "/word_TEM4.txt"); } public static void updateTEM8() { update(91, 47, "/word_TEM8.txt"); } public static void updateCATTI() { update(715, 70, "/word_CATTI.txt"); update(716, 35, "/word_CATTI.txt"); update(717, 94, "/word_CATTI.txt"); } /** * ? */ public static void updateComputer() { update(78, 191, "/word_computer.txt"); } /** * */ public static void updateOther() { // update(75, 58, "/words.txt"); update(76, 46, "/words.txt"); update(77, 27, "/words.txt"); //? update(79, 118, "/words.txt"); //? update(80, 18, "/words.txt"); //? update(81, 11, "/words.txt"); //? update(97, 34, "/words.txt"); //? update(98, 14, "/words.txt"); // update(147, 92, "/words.txt"); //? update(721, 17, "/words.txt"); //?? update(712, 3, "/words.txt"); update(713, 163, "/words.txt"); // update(363, 29, "/words.txt"); update(364, 25, "/words.txt"); update(365, 46, "/words.txt"); update(366, 50, "/words.txt"); update(355, 31, "/words.txt"); //? update(362, 59, "/words.txt"); // update(361, 54, "/words.txt"); update(358, 55, "/words.txt"); update(359, 33, "/words.txt"); update(293, 49, "/words.txt"); update(125, 24, "/words.txt"); update(125, 24, "/words.txt"); update(126, 42, "/words.txt"); update(127, 60, "/words.txt"); update(128, 109, "/words.txt"); update(129, 212, "/words.txt"); update(294, 53, "/words.txt"); update(725, 122, "/words.txt"); // update(720, 7, "/words.txt"); update(726, 3, "/words.txt"); update(676, 19, "/words.txt"); update(175, 26, "/words.txt"); update(144, 13, "/words.txt"); update(145, 19, "/words.txt"); update(146, 11, "/words.txt"); update(99, 12, "/words.txt"); update(87, 2, "/words.txt"); update(83, 7, "/words.txt"); update(84, 11, "/words.txt"); update(85, 6, "/words.txt"); update(86, 11, "/words.txt"); update(153, 13, "/words.txt"); } public static void update(int type, int pageNumber, String file) { file = "src/main/resources" + file; Set<Word> existWords = WordSources.get(file); Set<Word> words = fetch(type, pageNumber); LOGGER.debug("??" + existWords.size()); LOGGER.debug("??" + words.size()); words.addAll(existWords); LOGGER.debug("???" + words.size()); AtomicInteger i = new AtomicInteger(); List<String> allWords = words.stream().sorted().map(w -> i.incrementAndGet() + "\t" + w.getWord()) .collect(Collectors.toList()); try { Files.write(Paths.get(file), allWords); } catch (Exception e) { LOGGER.error("??", e); } } public static Set<Word> fetch(int type, int pageNumber) { Set<Word> words = new HashSet<>(); String url = "http://word.iciba.com/?action=words&class=" + type + "&course="; for (int i = 1; i <= pageNumber; i++) { String html = getContent(url + i); int times = 1; while (StringUtils.isBlank(html) && times < 4) { times++; //IP? DynamicIp.toNewIp(); html = getContent(url + i); } //LOGGER.debug("?HTML" +html); while (html.contains("??ip?")) { //IP? DynamicIp.toNewIp(); html = getContent(url + i); } words.addAll(parse(html)); } LOGGER.debug("url:" + url + "??" + words.size()); return words; } public static Set<Word> parse(String html) { Set<Word> words = new HashSet<>(); try { for (Element element : Jsoup.parse(html).select(WORD_CSS_PATH)) { String word = element.text().trim(); if (StringUtils.isNotBlank(word) && WordSources.isEnglish(word)) { words.add(new Word(word, "")); LOGGER.debug("???:" + word); } } } catch (Exception e) { LOGGER.error("???", e); } return words; } public static String getContent(String url) { LOGGER.debug("url:" + url); Connection conn = Jsoup.connect(url).header("Accept", ACCEPT).header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE).header("Connection", CONNECTION).header("Referer", REFERER) .header("Host", HOST).header("User-Agent", USER_AGENT).ignoreContentType(true); String html = ""; try { html = conn.post().html(); html = html.replaceAll("[\n\r]", ""); } catch (Exception e) { LOGGER.error("?URL" + url + "?", e); } return html; } public static void main(String[] args) { updatePrimarySchool(); updateJuniorSchool(); updateSeniorSchool(); updateUniversity(); updateNewConception(); updateCET4(); updateCET6(); updateKY(); updateTOEFL(); updateIELTS(); updateGRE(); updateGMAT(); updateTOEIC(); updateSAT(); updateBEC(); updateADULT(); updateMBA(); updateTEM4(); updateTEM8(); updateCATTI(); updateComputer(); updateOther(); } }