org.apdplat.superword.tools.WordsFetcher.java Source code

Java tutorial

Introduction

Here is the source code for org.apdplat.superword.tools.WordsFetcher.java

Source

/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**
 * Command-line utility that downloads word lists from word.iciba.com and merges
 * them into the word-list text files under {@code src/main/resources}.
 *
 * <p>Every {@code updateXxx()} method refreshes one target file by calling
 * {@link #update(int, int, String)} once per (word class, page count) pair, in
 * the order listed. The numeric word classes and page counts are site-specific
 * values taken verbatim from the original code. The original comments that
 * labelled the groups of classes were lost to mis-encoding, so the groups are
 * kept but only numbered here.
 *
 * @author name garbled in this copy; the license header lists yang-shangchuan@qq.com
 */
public class WordsFetcher {
    private static final Logger LOGGER = LoggerFactory.getLogger(WordsFetcher.class);

    /** CSS selector matching the elements whose text is a word on a list page. */
    private static final String WORD_CSS_PATH = "html body div#main_block div.word_box form#word_form div.word_main ul li div.word_main_list_w span";
    private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    private static final String HOST = "www.iciba.com";
    private static final String REFERER = "http://www.iciba.com/";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";

    /** Directory prefix prepended to every file name passed to {@link #update(int, int, String)}. */
    private static final String RESOURCE_DIR = "src/main/resources";
    /** List-page URL up to the word class; the class id, {@code &course=} and the page number follow. */
    private static final String WORDS_URL_PREFIX = "http://word.iciba.com/?action=words&class=";
    /** Total attempts (first request plus retries) made while a page comes back blank. */
    private static final int MAX_FETCH_ATTEMPTS = 4;
    /**
     * Text whose presence in a page triggers an IP switch and a re-fetch.
     * NOTE(review): this literal looks mis-encoded ("?" placeholders); it is kept
     * byte-for-byte and should be verified against the upstream source.
     */
    private static final String BLOCKED_MARKER = "??ip?";

    private WordsFetcher() {
        // static utility class, not instantiable
    }

    /**
     * Refreshes {@code /word_primary_school.txt} from the word classes listed below.
     */
    public static void updatePrimarySchool() {
        updateAll("/word_primary_school.txt", new int[][]{
                // group 1
                {63, 5}, {64, 3}, {65, 3}, {66, 3}, {67, 4}, {68, 4},
                {69, 4}, {70, 5}, {71, 5}, {72, 4}, {73, 14}, {74, 10},
                // group 2
                {655, 8}, {656, 8}, {657, 2}, {658, 3}, {149, 3}, {150, 4},
                {151, 3}, {152, 4}, {154, 6}, {155, 8}, {156, 8},
                // group 3
                {265, 2}, {266, 3}, {267, 1}, {268, 2}, {269, 1}, {271, 2}, {272, 3},
        });
    }

    /**
     * Refreshes {@code /word_junior_school.txt} from the word classes listed below.
     */
    public static void updateJuniorSchool() {
        updateAll("/word_junior_school.txt", new int[][]{
                // group 1
                {57, 27}, {58, 24}, {59, 21}, {60, 15}, {61, 20}, {62, 16},
                // group 2
                {105, 30}, {106, 20}, {107, 28}, {108, 25}, {109, 37},
                // group 3
                {221, 19}, {222, 19}, {223, 18}, {224, 17}, {225, 12}, {226, 8},
                // group 4
                // NOTE(review): classes 224 and 226 already appear in group 3 with
                // different page counts; possibly typos for other class ids. Kept as-is.
                {273, 20}, {224, 18}, {226, 16}, {227, 16}, {228, 12}, {229, 14},
                // group 5
                {728, 18}, {729, 25},
                // group 6
                {678, 17},
        });
    }

    /**
     * Refreshes {@code /word_senior_school.txt} from the word classes listed below.
     */
    public static void updateSeniorSchool() {
        updateAll("/word_senior_school.txt", new int[][]{
                // group 1
                {51, 19}, {52, 25}, {53, 24}, {54, 20}, {55, 25}, {56, 23},
                // group 2
                {110, 14}, {111, 14}, {112, 19}, {113, 15}, {114, 18}, {118, 20}, {119, 19},
                // group 3
                {139, 5}, {140, 194},
        });
    }

    /**
     * Refreshes {@code /word_university.txt} from word classes 45-50.
     */
    public static void updateUniversity() {
        updateAll("/word_university.txt", new int[][]{
                {45, 27}, {46, 37}, {47, 40}, {48, 46}, {49, 25}, {50, 65},
        });
    }

    /**
     * Refreshes {@code /word_new_conception.txt} from word classes 41-44.
     */
    public static void updateNewConception() {
        updateAll("/word_new_conception.txt", new int[][]{
                {41, 41}, {42, 49}, {43, 81}, {44, 76},
        });
    }

    /** Refreshes {@code /word_CET4.txt} from word classes 11 and 122. */
    public static void updateCET4() {
        updateAll("/word_CET4.txt", new int[][]{{11, 226}, {122, 35}});
    }

    /** Refreshes {@code /word_CET6.txt} from word classes 12 and 123. */
    public static void updateCET6() {
        updateAll("/word_CET6.txt", new int[][]{{12, 105}, {123, 25}});
    }

    /** Refreshes {@code /word_KY.txt} from word classes 13 and 143. */
    public static void updateKY() {
        updateAll("/word_KY.txt", new int[][]{{13, 274}, {143, 3}});
    }

    /** Refreshes {@code /word_TOEFL.txt} from word class 14. */
    public static void updateTOEFL() {
        update(14, 245, "/word_TOEFL.txt");
    }

    /** Refreshes {@code /word_IELTS.txt} from word class 15. */
    public static void updateIELTS() {
        update(15, 228, "/word_IELTS.txt");
    }

    /** Refreshes {@code /word_GRE.txt} from word class 16. */
    public static void updateGRE() {
        update(16, 375, "/word_GRE.txt");
    }

    /** Refreshes {@code /word_GMAT.txt} from word classes 36-38. */
    public static void updateGMAT() {
        updateAll("/word_GMAT.txt", new int[][]{{36, 40}, {37, 54}, {38, 108}});
    }

    /** Refreshes {@code /word_TOEIC.txt} from word class 682. */
    public static void updateTOEIC() {
        update(682, 42, "/word_TOEIC.txt");
    }

    /** Refreshes {@code /word_SAT.txt} from word class 121. */
    public static void updateSAT() {
        update(121, 11, "/word_SAT.txt");
    }

    /** Refreshes {@code /word_BEC.txt} from word classes 680 and 681. */
    public static void updateBEC() {
        updateAll("/word_BEC.txt", new int[][]{{680, 47}, {681, 10}});
    }

    /** Refreshes {@code /word_ADULT.txt} from word classes 703-711. */
    public static void updateADULT() {
        updateAll("/word_ADULT.txt", new int[][]{
                {703, 144}, {704, 284}, {705, 143}, {706, 11}, {707, 198},
                {708, 171}, {709, 89}, {710, 61}, {711, 180},
        });
    }

    /** Refreshes {@code /word_MBA.txt} from word class 39. */
    public static void updateMBA() {
        update(39, 243, "/word_MBA.txt");
    }

    /** Refreshes {@code /word_TEM4.txt} from word class 90. */
    public static void updateTEM4() {
        update(90, 105, "/word_TEM4.txt");
    }

    /** Refreshes {@code /word_TEM8.txt} from word class 91. */
    public static void updateTEM8() {
        update(91, 47, "/word_TEM8.txt");
    }

    /** Refreshes {@code /word_CATTI.txt} from word classes 715-717. */
    public static void updateCATTI() {
        updateAll("/word_CATTI.txt", new int[][]{{715, 70}, {716, 35}, {717, 94}});
    }

    /**
     * Refreshes {@code /word_computer.txt} from word class 78.
     */
    public static void updateComputer() {
        update(78, 191, "/word_computer.txt");
    }

    /**
     * Refreshes the catch-all {@code /words.txt} from the remaining word classes
     * listed below.
     */
    public static void updateOther() {
        updateAll("/words.txt", new int[][]{
                // group 1
                {75, 58}, {76, 46}, {77, 27},
                // group 2
                {79, 118},
                // group 3
                {80, 18},
                // group 4
                {81, 11},
                // group 5
                {97, 34},
                // group 6
                {98, 14},
                // group 7
                {147, 92},
                // group 8
                {721, 17},
                // group 9
                {712, 3}, {713, 163},
                // group 10
                {363, 29}, {364, 25}, {365, 46}, {366, 50}, {355, 31},
                // group 11
                {362, 59},
                // group 12
                // Class 125 was listed twice in a row with identical arguments in the
                // original; the second fetch could only re-add the same words, so it
                // is omitted here.
                {361, 54}, {358, 55}, {359, 33}, {293, 49}, {125, 24}, {126, 42},
                {127, 60}, {128, 109}, {129, 212}, {294, 53}, {725, 122},
                // group 13
                {720, 7}, {726, 3}, {676, 19}, {175, 26}, {144, 13}, {145, 19}, {146, 11},
                {99, 12}, {87, 2}, {83, 7}, {84, 11}, {85, 6}, {86, 11}, {153, 13},
        });
    }

    /**
     * Calls {@link #update(int, int, String)} for each entry, in array order.
     *
     * @param file            target file name, passed through unchanged
     * @param classesAndPages rows of {@code {wordClass, pageCount}}
     */
    private static void updateAll(String file, int[][] classesAndPages) {
        for (int[] entry : classesAndPages) {
            update(entry[0], entry[1], file);
        }
    }

    /**
     * Fetches the words of one word class, merges them with the words already in
     * the target file, and rewrites that file as sorted {@code index<TAB>word}
     * lines with the index starting at 1.
     *
     * <p>A failure while writing is logged and not rethrown.
     *
     * @param type       word class id placed in the list-page URL
     * @param pageNumber number of list pages to fetch (pages 1..pageNumber)
     * @param file       target file name, appended to {@code src/main/resources}
     */
    public static void update(int type, int pageNumber, String file) {
        String path = RESOURCE_DIR + file;
        Set<Word> existWords = WordSources.get(path);
        Set<Word> words = fetch(type, pageNumber);
        LOGGER.debug("??{}", existWords.size());
        LOGGER.debug("??{}", words.size());
        words.addAll(existWords);
        LOGGER.debug("???{}", words.size());

        // Sort first, then number, so the index reflects the sorted position.
        List<Word> sorted = words.stream().sorted().collect(Collectors.toList());
        List<String> lines = new ArrayList<>(sorted.size());
        for (Word word : sorted) {
            lines.add((lines.size() + 1) + "\t" + word.getWord());
        }
        try {
            Files.write(Paths.get(path), lines);
        } catch (Exception e) {
            // Best effort: a failed write must not abort the remaining updates.
            LOGGER.error("??", e);
        }
    }

    /**
     * Downloads pages 1..{@code pageNumber} of a word class and collects the words
     * parsed from each page.
     *
     * @param type       word class id placed in the list-page URL
     * @param pageNumber number of list pages to fetch
     * @return a new mutable set with the words found on all pages
     */
    public static Set<Word> fetch(int type, int pageNumber) {
        Set<Word> words = new HashSet<>();
        String url = WORDS_URL_PREFIX + type + "&course=";
        for (int page = 1; page <= pageNumber; page++) {
            words.addAll(parse(fetchPage(url + page)));
        }
        LOGGER.debug("url:{}??{}", url, words.size());
        return words;
    }

    /**
     * Fetches one list page, switching IP via {@code DynamicIp.toNewIp()} and
     * retrying while the response is blank or contains the block marker.
     *
     * @param pageUrl full URL of the page
     * @return the page HTML; may still be blank if every blank-response retry failed
     */
    private static String fetchPage(String pageUrl) {
        String html = getContent(pageUrl);
        for (int attempt = 1; StringUtils.isBlank(html) && attempt < MAX_FETCH_ATTEMPTS; attempt++) {
            DynamicIp.toNewIp();
            html = getContent(pageUrl);
        }
        // NOTE(review): this loop has no retry limit; it ends only when a fetched
        // page no longer contains the marker (a failed fetch returns "" and ends it).
        while (html.contains(BLOCKED_MARKER)) {
            DynamicIp.toNewIp();
            html = getContent(pageUrl);
        }
        return html;
    }

    /**
     * Extracts words from a list page: the trimmed text of every element matching
     * {@link #WORD_CSS_PATH} that is non-blank and accepted by
     * {@code WordSources.isEnglish}.
     *
     * <p>Any exception during parsing is logged, and the words collected so far
     * are returned.
     *
     * @param html page HTML
     * @return a new mutable set of the words found (possibly empty)
     */
    public static Set<Word> parse(String html) {
        Set<Word> words = new HashSet<>();
        try {
            for (Element element : Jsoup.parse(html).select(WORD_CSS_PATH)) {
                String word = element.text().trim();
                if (StringUtils.isNotBlank(word) && WordSources.isEnglish(word)) {
                    words.add(new Word(word, ""));
                    LOGGER.debug("???:{}", word);
                }
            }
        } catch (Exception e) {
            LOGGER.error("???", e);
        }
        return words;
    }

    /**
     * Requests a URL with browser-like headers and returns the response document's
     * HTML with all CR and LF characters removed.
     *
     * <p>The request is sent with {@code Connection.post()}, as in the original code.
     *
     * @param url URL to request
     * @return the HTML without line breaks, or an empty string if the request or
     *         parsing failed (the failure is logged)
     */
    public static String getContent(String url) {
        LOGGER.debug("url:{}", url);
        Connection conn = Jsoup.connect(url)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Referer", REFERER)
                .header("Host", HOST)
                .header("User-Agent", USER_AGENT)
                .ignoreContentType(true);
        String html = "";
        try {
            html = conn.post().html();
            html = html.replaceAll("[\n\r]", "");
        } catch (Exception e) {
            LOGGER.error("?URL" + url + "?", e);
        }
        return html;
    }

    /**
     * Runs every update method in a fixed order.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        updatePrimarySchool();
        updateJuniorSchool();
        updateSeniorSchool();
        updateUniversity();
        updateNewConception();
        updateCET4();
        updateCET6();
        updateKY();
        updateTOEFL();
        updateIELTS();
        updateGRE();
        updateGMAT();
        updateTOEIC();
        updateSAT();
        updateBEC();
        updateADULT();
        updateMBA();
        updateTEM4();
        updateTEM8();
        updateCATTI();

        updateComputer();
        updateOther();
    }

}