org.apdplat.superword.extract.SynonymAntonymExtractor.java Source code

Java tutorial

Introduction

Here is the source code for org.apdplat.superword.extract.SynonymAntonymExtractor.java

Source

/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, ??,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.superword.extract;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.SynonymAntonym;
import org.apdplat.superword.model.Word;
import org.apdplat.superword.tools.HtmlFormatter;
import org.apdplat.superword.tools.WordClassifier;
import org.apdplat.superword.tools.WordSources;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Extracts the synonyms and antonyms of words from dictionary HTML.
 *
 * <p>Input may be a single text file, a directory tree of such files, or a ZIP
 * archive of them. Every line of an input file is expected to hold one record:
 * the word, a separator, and the HTML of that word's dictionary page. A single
 * word can also be looked up live at {@code http://www.iciba.com/}.
 *
 * <p>All members are static; the private constructor prevents instantiation.
 *
 * @author yang-shangchuan@qq.com (contact taken from the license header; the
 *         original author name did not survive the file's encoding loss)
 */
public class SynonymAntonymExtractor {

    private SynonymAntonymExtractor() {
    }

    private static final Logger LOGGER = LoggerFactory.getLogger(SynonymAntonymExtractor.class);

    /** CSS path of the elements that each hold one heading plus its list of related words. */
    private static final String SYNONYM_ANTONYM_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.simple div#dict_content_3.dict_content.tongyi div.industry_box div.industry";
    /** Selector, relative to one such element, of the heading that names the relation type. */
    private static final String TYPE = "h4";
    /** Selector, relative to one such element, of the links whose text is a related word. */
    private static final String WORDS = "ul dl dd a";

    /**
     * Separator between the word and its HTML on one input line. It is compiled as a
     * literal so that characters with a regex meaning are matched verbatim instead of
     * being interpreted (a plain {@code String.split} would treat it as a regex).
     *
     * NOTE(review): this literal looks like it lost its original non-ASCII characters
     * to an encoding problem - confirm the real separator against the upstream file.
     */
    private static final Pattern FIELD_SEPARATOR = Pattern.compile("??", Pattern.LITERAL);

    /** Archive read by the two private drivers below; a developer-machine path. */
    private static final String ORIGIN_HTML_ZIP = "/Users/apple/?/origin_html.zip";

    /**
     * Parses whatever {@code path} points at, choosing the strategy from the path itself.
     *
     * @param path a path ending in {@code .zip}, a directory, or a regular file
     * @return the valid entries found; never {@code null}
     */
    public static Set<SynonymAntonym> parse(String path) {
        if (path.endsWith(".zip")) {
            return parseZip(path);
        }
        return Files.isDirectory(Paths.get(path)) ? parseDir(path) : parseFile(path);
    }

    /**
     * Parses every file found below a directory (recursively).
     *
     * @param dir the directory to walk
     * @return the valid entries of all files; on a walk error the entries collected so
     *         far are returned and the error is logged
     */
    public static Set<SynonymAntonym> parseDir(String dir) {
        Set<SynonymAntonym> data = new HashSet<>();
        LOGGER.info("?" + dir);
        try {
            Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() {

                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    data.addAll(parseFile(file.toFile().getAbsolutePath()));
                    return FileVisitResult.CONTINUE;
                }

            });
        } catch (IOException e) {
            LOGGER.error("?", e);
        }
        return data;
    }

    /**
     * Parses every entry of a ZIP archive.
     *
     * <p>Entries are read straight from the archive's file system, so no temporary copy
     * on disk is needed. An entry that cannot be read is logged and skipped; the walk
     * continues with the next entry.
     *
     * @param zipFile path of the archive
     * @return the valid entries of all archive members; never {@code null}
     */
    public static Set<SynonymAntonym> parseZip(String zipFile) {
        Set<SynonymAntonym> data = new HashSet<>();
        LOGGER.info("?ZIP" + zipFile);
        try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) {
            for (Path root : fs.getRootDirectories()) {
                LOGGER.info("?" + root);
                Files.walkFileTree(root, new SimpleFileVisitor<Path>() {

                    @Override
                    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                        LOGGER.info("?" + file);
                        try (InputStream in = Files.newInputStream(file)) {
                            readRecords(in, data);
                        } catch (IOException ex) {
                            LOGGER.error("?", ex);
                        }
                        return FileVisitResult.CONTINUE;
                    }

                });
            }
        } catch (Exception e) {
            LOGGER.error("?", e);
        }
        return data;
    }

    /**
     * Parses one text file of word/HTML records.
     *
     * @param file path of the file
     * @return the valid entries of the file; if reading fails part-way, the entries
     *         read so far are returned and the error is logged
     */
    public static Set<SynonymAntonym> parseFile(String file) {
        Set<SynonymAntonym> data = new HashSet<>();
        LOGGER.info("?" + file);
        try (InputStream in = new FileInputStream(file)) {
            readRecords(in, data);
        } catch (IOException e) {
            LOGGER.error("?", e);
        }
        return data;
    }

    /**
     * Reads word/HTML records line by line and adds every valid result to {@code data}.
     *
     * <p>The stream is decoded as UTF-8, the same encoding this class uses for the files
     * it writes, rather than whatever the platform default happens to be. The caller owns
     * (and closes) the stream.
     *
     * @param in   stream positioned at the first record
     * @param data receives each parsed entry for which {@code valid()} is true
     * @throws IOException if reading from the stream fails
     */
    private static void readRecords(InputStream in, Set<SynonymAntonym> data) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] attr = FIELD_SEPARATOR.split(line);
            // A well-formed record is exactly "word<separator>html".
            if (attr.length != 2) {
                LOGGER.error(
                        "?'??'???????"
                                + line);
                continue;
            }
            String word = attr[0];
            LOGGER.info("???" + word);
            SynonymAntonym sa = parseSynonymAntonym(attr[1], word);
            if (sa.valid()) {
                data.add(sa);
            }
        }
    }

    /**
     * Extracts the synonyms and antonyms of {@code word} from its dictionary page.
     *
     * <p>Each element matched by the class-level CSS path contributes its linked words
     * to either the synonym or the antonym list, depending on the text of its heading.
     * Any exception raised while parsing is logged, and whatever was collected up to
     * that point is returned.
     *
     * @param html HTML of the dictionary page
     * @param word the word the page describes
     * @return an entry for {@code word}; never {@code null}, possibly without any
     *         synonyms or antonyms
     */
    public static SynonymAntonym parseSynonymAntonym(String html, String word) {
        SynonymAntonym synonymAntonym = new SynonymAntonym();
        synonymAntonym.setWord(new Word(word, ""));
        try {
            for (Element box : Jsoup.parse(html).select(SYNONYM_ANTONYM_CSS_PATH)) {
                String type = box.select(TYPE).text().trim();
                LOGGER.debug("type:" + type);
                Elements links = box.select(WORDS);
                for (Element link : links) {
                    String w = link.text().trim();
                    LOGGER.debug("word:" + w);
                    if (StringUtils.isBlank(w)) {
                        LOGGER.error("??????" + word);
                        continue;
                    }
                    // NOTE(review): the heading labels below look mangled by the same
                    // encoding loss as the separator - confirm the original labels.
                    switch (type) {
                    case "??":
                        synonymAntonym.addSynonym(new Word(w, ""));
                        break;
                    case "???":
                        synonymAntonym.addAntonym(new Word(w, ""));
                        break;
                    default:
                        LOGGER.error("???????" + type);
                    }
                }
            }
            LOGGER.info("??????" + synonymAntonym);
        } catch (Exception e) {
            LOGGER.error("??????", e);
        }
        return synonymAntonym;
    }

    /** Keeps only the entries whose word is contained in the syllabus vocabulary. */
    private static Set<SynonymAntonym> inSyllabusVocabulary(Set<SynonymAntonym> synonymAntonyms) {
        Set<Word> voc = WordSources.getSyllabusVocabulary();
        return synonymAntonyms.stream().filter(sa -> voc.contains(sa.getWord())).collect(Collectors.toSet());
    }

    /** Keeps only the entries whose word is NOT contained in the syllabus vocabulary. */
    private static Set<SynonymAntonym> notInSyllabusVocabulary(Set<SynonymAntonym> synonymAntonyms) {
        Set<Word> voc = WordSources.getSyllabusVocabulary();
        return synonymAntonyms.stream().filter(sa -> !voc.contains(sa.getWord())).collect(Collectors.toSet());
    }

    /** Writes {@code content} to {@code target} encoded as UTF-8. */
    private static void writeUtf8(String target, String content) throws IOException {
        Files.write(Paths.get(target), content.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Developer driver: parses the archive and writes the synonym/antonym listings,
     * split by syllabus membership, into {@code src/main/resources}. Failures while
     * writing are logged, not propagated.
     */
    private static void parseSynonymAntonym() {
        Set<SynonymAntonym> synonymAntonyms = parse(ORIGIN_HTML_ZIP);
        // The meaning of the numeric argument is defined by HtmlFormatter.
        String inSyllabusVocabularyHtml = HtmlFormatter
                .toHtmlForSynonymAntonym(inSyllabusVocabulary(synonymAntonyms), 6);
        String notInSyllabusVocabularyHtml = HtmlFormatter
                .toHtmlForSynonymAntonym(notInSyllabusVocabulary(synonymAntonyms), 6);
        try {
            writeUtf8("src/main/resources/synonym_antonym_in_syllabus_vocabulary.txt",
                    inSyllabusVocabularyHtml);
            writeUtf8("src/main/resources/synonym_antonym_not_in_syllabus_vocabulary.txt",
                    notInSyllabusVocabularyHtml);
        } catch (Exception e) {
            LOGGER.error(e.getMessage(), e);
        }
    }

    /**
     * Developer driver: parses the archive and writes the antonym listings, split by
     * syllabus membership, into {@code src/main/resources}. Failures while writing are
     * logged, not propagated.
     */
    private static void parseAntonym() {
        Set<SynonymAntonym> antonyms = parse(ORIGIN_HTML_ZIP);
        // The meaning of the numeric argument is defined by HtmlFormatter.
        String inSyllabusVocabularyHtml = HtmlFormatter.toHtmlForAntonym(inSyllabusVocabulary(antonyms), 6);
        String notInSyllabusVocabularyHtml = HtmlFormatter.toHtmlForAntonym(notInSyllabusVocabulary(antonyms), 6);
        try {
            writeUtf8("src/main/resources/antonym_in_syllabus_vocabulary.txt",
                    inSyllabusVocabularyHtml);
            writeUtf8("src/main/resources/antonym_not_in_syllabus_vocabulary.txt",
                    notInSyllabusVocabularyHtml);
        } catch (Exception e) {
            LOGGER.error(e.getMessage(), e);
        }
    }

    /**
     * Looks a word up online and extracts its synonyms and antonyms.
     *
     * <p>The page is fetched from {@code http://www.iciba.com/<word>} with a timeout of
     * 15000 ms. The word is appended to the URL as given, without any escaping.
     *
     * @param word the word to look up
     * @return the parsed entry, or {@code null} if fetching or parsing the page threw
     *         (the exception is logged)
     */
    public static SynonymAntonym parseSynonymAntonym(String word) {
        try {
            return parseSynonymAntonym(Jsoup.parse(new URL("http://www.iciba.com/" + word), 15000).html(), word);
        } catch (Exception e) {
            LOGGER.error("??????", e);
        }
        return null;
    }

    /**
     * Runs the antonym driver. The commented-out calls are the alternative entry points
     * (single-word online lookup, combined synonym/antonym export).
     */
    public static void main(String[] args) {
        //parseSynonymAntonym("back");
        //parseSynonymAntonym();
        parseAntonym();
    }
}