org.apdplat.superword.extract.PartOfSpeechExtractor.java Source code

Introduction

Here is the source code for org.apdplat.superword.extract.PartOfSpeechExtractor.java
Source

/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.superword.extract;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.apdplat.superword.rule.PartOfSpeech;
import org.apdplat.superword.tools.HtmlFormatter;
import org.apdplat.superword.tools.WordClassifier;
import org.apdplat.superword.tools.WordSources;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileSystem;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Extracts part-of-speech tags for words from the Collins section of
 * www.iciba.com dictionary pages.
 *
 * <p>Input is line-oriented text in which every line holds a word and the HTML
 * of its dictionary page, joined by {@code WORD_HTML_DELIMITER}. Such text can
 * be read from a single file, from every file under a directory, or from every
 * entry of a zip archive. Words missing from the local data can additionally be
 * fetched online (see {@link #compensate(Set)}).
 *
 * <p>All methods are static; the class cannot be instantiated.
 *
 * @author Yang Shangchuan (yang-shangchuan@qq.com)
 */
public final class PartOfSpeechExtractor {

    private static final Logger LOGGER = LoggerFactory.getLogger(PartOfSpeechExtractor.class);

    /** jsoup selector for the elements whose text carries the part-of-speech tags. */
    private static final String PART_OF_SPEECH_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.collins div#dict_tab_101.tab_content.tab_authorities div.part_main div.collins_content div.collins_en_cn div.caption span.st";

    /**
     * Separator between the word and its HTML on each input line.
     *
     * <p>NOTE(review): this literal looks mis-encoded (non-ASCII characters
     * replaced by '?'); confirm the real delimiter against the data files.
     */
    private static final String WORD_HTML_DELIMITER = "??";

    /**
     * The delimiter compiled as a literal. Without quoting, the delimiter would
     * be interpreted as a regular expression and could be invalid (a dangling
     * quantifier) or match unintended text.
     */
    private static final Pattern WORD_HTML_SPLITTER = Pattern.compile(Pattern.quote(WORD_HTML_DELIMITER));

    /**
     * Multi-word tags that must survive the whitespace split as one token.
     * Each phrase is rewritten with '-' in place of ' '. The order matters:
     * longer phrases come before the shorter phrases they start with.
     */
    private static final String[] COMBINING_FORM_TAGS = {
            "COMB in ADJ and N-COUNT",
            "COMB in ADJ and N",
            "COMB in ADJ",
            "COMB in ADJ-GRADED",
            "COMB in N-COUNT",
            "COMB in COLOUR",
            "COMB in N",
            "COMB in N-UNCOUNT",
            "COMB in QUANT",
            "COMB in VERB",
    };

    /** Input used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_ORIGIN_HTML = "/Users/apple/?/origin_html.zip";

    private PartOfSpeechExtractor() {
    }

    /**
     * Parses the given path, choosing the strategy from its kind: a name ending
     * in {@code .zip} is read as an archive, a directory is walked recursively,
     * and anything else is read as a single file.
     *
     * @param path zip archive, directory or plain file
     * @return the words for which at least one part-of-speech tag was found
     */
    public static Set<Word> parse(String path) {
        if (path.endsWith(".zip")) {
            return parseZip(path);
        }
        if (Files.isDirectory(Paths.get(path))) {
            return parseDir(path);
        }
        return parseFile(path);
    }

    /**
     * Parses every file found under a directory tree.
     *
     * @param dir root directory to walk
     * @return words collected from all files; an I/O failure of the walk itself
     *         is logged and whatever was collected so far is returned
     */
    public static Set<Word> parseDir(String dir) {
        Set<Word> data = new HashSet<>();
        LOGGER.info("?" + dir);
        try {
            Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() {

                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
                    data.addAll(parseFile(file.toFile().getAbsolutePath()));
                    return FileVisitResult.CONTINUE;
                }

            });
        } catch (IOException e) {
            LOGGER.error("?", e);
        }
        return data;
    }

    /**
     * Parses every entry of a zip archive.
     *
     * <p>Entries are read directly through the zip file system, so no temporary
     * copy is written to disk. A failure while reading one entry is logged and
     * the remaining entries are still processed.
     *
     * @param zipFile path of the zip archive
     * @return words collected from all entries; a failure to open or walk the
     *         archive is logged and whatever was collected so far is returned
     */
    public static Set<Word> parseZip(String zipFile) {
        Set<Word> data = new HashSet<>();
        LOGGER.info("?ZIP" + zipFile);
        try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) {
            for (Path root : fs.getRootDirectories()) {
                LOGGER.info("?" + root);
                Files.walkFileTree(root, new SimpleFileVisitor<Path>() {

                    @Override
                    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
                        LOGGER.info("?" + file);
                        try (InputStream in = Files.newInputStream(file)) {
                            collectWords(in, data);
                        } catch (IOException e) {
                            LOGGER.error("?", e);
                        }
                        return FileVisitResult.CONTINUE;
                    }

                });
            }
        } catch (Exception e) {
            LOGGER.error("?", e);
        }
        return data;
    }

    /**
     * Parses a single file of {@code word<delimiter>html} lines, decoded as UTF-8.
     *
     * @param file path of the file on the default file system
     * @return words with at least one part-of-speech tag; on an I/O error the
     *         error is logged and the words read so far are returned
     */
    public static Set<Word> parseFile(String file) {
        Set<Word> data = new HashSet<>();
        LOGGER.info("?" + file);
        try (InputStream in = new FileInputStream(file)) {
            collectWords(in, data);
        } catch (IOException e) {
            LOGGER.error("?", e);
        }
        return data;
    }

    /**
     * Reads {@code word<delimiter>html} lines from the stream and adds every
     * word that yields at least one part-of-speech tag to {@code data}.
     *
     * <p>Lines that do not split into exactly two parts are logged and skipped.
     * The stream is decoded as UTF-8 and is NOT closed here; the caller owns it.
     *
     * @param in   source of the lines
     * @param data receives the parsed words
     * @throws IOException if reading from the stream fails
     */
    private static void collectWords(InputStream in, Set<Word> data) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] parts = WORD_HTML_SPLITTER.split(line);
            if (parts.length != 2) {
                LOGGER.error("?'??'???????" + line);
                continue;
            }
            Word w = parseWord(parts[1], parts[0]);
            if (w != null && !w.getPartOfSpeeches().isEmpty()) {
                data.add(w);
            }
        }
    }

    /**
     * Extracts the part-of-speech tags of a word from the HTML of its page.
     *
     * <p>For every element matched by {@code PART_OF_SPEECH_CSS_PATH} the text
     * is cleaned (';' removed, multi-word "COMB in ..." tags hyphenated) and
     * split on whitespace. Tokens containing "PHR" are ignored, commas are
     * stripped, and only tokens starting with an upper-case ASCII letter are
     * kept. "VERB" and "VERB-ERG" are stored as "V" and "V-ERG".
     *
     * @param html HTML of the dictionary page
     * @param word the word the page belongs to
     * @return a {@link Word} carrying the tags found, possibly none; never
     *         {@code null}. Any exception during parsing is logged and the tags
     *         collected up to that point are returned.
     */
    public static Word parseWord(String html, String word) {
        LOGGER.info("???" + word);
        Word w = new Word(word, "");
        try {
            for (Element element : Jsoup.parse(html).select(PART_OF_SPEECH_CSS_PATH)) {
                String partOfSpeech = element.text();
                LOGGER.debug("??:" + partOfSpeech);
                if (StringUtils.isBlank(partOfSpeech) || partOfSpeech.contains("See also")) {
                    continue;
                }
                partOfSpeech = partOfSpeech.replace(";", "");
                for (String phrase : COMBINING_FORM_TAGS) {
                    partOfSpeech = partOfSpeech.replace(phrase, phrase.replace(' ', '-'));
                }
                for (String token : partOfSpeech.split("\\s+")) {
                    if (token.length() < 1) {
                        LOGGER.debug("?:" + token);
                        continue;
                    }
                    if (token.contains("PHR")) {
                        LOGGER.debug(":" + token);
                        continue;
                    }
                    String tag = token.replace(",", "");
                    if (tag.isEmpty()) {
                        // The token consisted only of commas; without this guard
                        // charAt(0) would throw and abort parsing of the whole word.
                        continue;
                    }
                    char first = tag.charAt(0);
                    if (first < 'A' || first > 'Z') {
                        continue;
                    }
                    if ("VERB".equals(tag)) {
                        tag = "V";
                    } else if ("VERB-ERG".equals(tag)) {
                        tag = "V-ERG";
                    }
                    w.addPartOfSpeech(tag);
                    LOGGER.debug("??:" + tag);
                }
            }
        } catch (Exception e) {
            LOGGER.error("??", e);
        }
        return w;
    }

    /**
     * Returns the subset of {@code words} contained in the syllabus vocabulary.
     * The result is a fresh, mutable {@link HashSet} because the caller passes
     * it to {@link #compensate(Set)}, which adds to it.
     */
    private static Set<Word> inSyllabusVocabulary(Set<Word> words) {
        Set<Word> voc = WordSources.getSyllabusVocabulary();
        return words.stream().filter(voc::contains).collect(Collectors.toCollection(HashSet::new));
    }

    /** Returns the subset of {@code words} NOT contained in the syllabus vocabulary. */
    private static Set<Word> notInSyllabusVocabulary(Set<Word> words) {
        Set<Word> voc = WordSources.getSyllabusVocabulary();
        return words.stream().filter(w -> !voc.contains(w)).collect(Collectors.toCollection(HashSet::new));
    }

    /**
     * Parses {@code source}, splits the result by syllabus membership, fills in
     * syllabus words missing from the local data, and writes four UTF-8 report
     * files under {@code src/main/resources}. Failures while writing are logged.
     *
     * @param source zip archive, directory or file accepted by {@link #parse(String)}
     */
    private static void extractAndSave(String source) {
        Set<Word> words = parse(source);
        Set<Word> inSyllabusVocabulary = inSyllabusVocabulary(words);
        compensate(inSyllabusVocabulary);

        String inSyllabusVocabularyText = formatPartOfSpeech(inSyllabusVocabulary);
        Set<Word> notInSyllabusVocabulary = notInSyllabusVocabulary(words);
        String notInSyllabusVocabularyText = formatPartOfSpeech(notInSyllabusVocabulary);
        LOGGER.info(formatPartOfSpeechType(words));
        try {
            Files.write(Paths.get("src/main/resources/part_of_speech_in_syllabus_vocabulary.txt"),
                    inSyllabusVocabularyText.getBytes(StandardCharsets.UTF_8));
            Files.write(Paths.get("src/main/resources/part_of_speech_not_in_syllabus_vocabulary.txt"),
                    notInSyllabusVocabularyText.getBytes(StandardCharsets.UTF_8));
            Files.write(Paths.get("src/main/resources/group_part_of_speech_in_syllabus_vocabulary.txt"),
                    HtmlFormatter.toHtmlForPartOfSpeech(group(inSyllabusVocabulary))
                            .getBytes(StandardCharsets.UTF_8));
            Files.write(Paths.get("src/main/resources/group_part_of_speech_not_in_syllabus_vocabulary.txt"),
                    HtmlFormatter.toHtmlForPartOfSpeech(group(notInSyllabusVocabulary))
                            .getBytes(StandardCharsets.UTF_8));
        } catch (Exception e) {
            LOGGER.error(e.getMessage(), e);
        }
    }

    /** Inverts the word-to-tags relation: maps each tag to the words carrying it. */
    private static Map<String, Set<String>> group(Set<Word> words) {
        Map<String, Set<String>> data = new HashMap<>();
        for (Word w : words) {
            for (String pos : w.getPartOfSpeeches()) {
                data.computeIfAbsent(pos, key -> new HashSet<>()).add(w.getWord());
            }
        }
        return data;
    }

    /**
     * Renders one tab-separated line per word (running number starting at 1,
     * word, formatted tags), followed by the tag summary produced by
     * {@link #formatPartOfSpeechType(Set)}.
     */
    private static String formatPartOfSpeech(Set<Word> words) {
        StringBuilder text = new StringBuilder();
        int index = 0;
        for (Word w : words) {
            index++;
            text.append(index).append("\t").append(w.getWord()).append("\t")
                    .append(w.getFormatPartOfSpeeches()).append("\n");
        }
        text.append(formatPartOfSpeechType(words));
        return text.toString();
    }

    /**
     * Renders a summary of the distinct tags used by {@code words}: a header
     * line with their count, then one {@code #tag=meaning} line per tag.
     */
    private static String formatPartOfSpeechType(Set<Word> words) {
        Set<String> tags = new HashSet<>();
        for (Word w : words) {
            tags.addAll(w.getPartOfSpeeches());
        }
        StringBuilder text = new StringBuilder();
        text.append("#??(").append(tags.size()).append(")").append("\n");
        for (String tag : tags) {
            text.append("#").append(tag).append("=").append(PartOfSpeech.getMeaning(tag)).append("\n");
        }
        return text.toString();
    }

    /**
     * Adds syllabus words that are missing from {@code words} by fetching their
     * pages online through {@link #parseWord(String)}. A fetched word is added
     * only if at least one part-of-speech tag was found for it.
     *
     * <p>The missing words are whatever {@code WordSources.minus(syllabus, words)}
     * returns (presumably the set difference; verify against WordSources).
     *
     * @param words set to complete; it is modified in place and must be mutable
     */
    public static void compensate(Set<Word> words) {
        Set<Word> minus = WordSources.minus(WordSources.getSyllabusVocabulary(), words);
        LOGGER.debug("?" + minus.size());
        for (Word missing : minus) {
            LOGGER.debug(missing.getWord());
            Word fetched = parseWord(missing.getWord());
            if (fetched != null && !fetched.getPartOfSpeeches().isEmpty()) {
                words.add(fetched);
            }
        }
    }

    /**
     * Downloads the page of {@code word} from www.iciba.com (15 second timeout)
     * and extracts its part-of-speech tags.
     *
     * @param word the word to look up; it is appended to the URL as-is
     * @return the parsed word, or {@code null} if the page could not be fetched
     *         (the failure is logged)
     */
    public static Word parseWord(String word) {
        try {
            return parseWord(Jsoup.parse(new URL("http://www.iciba.com/" + word), 15000).html(), word);
        } catch (Exception e) {
            LOGGER.error("??", e);
        }
        return null;
    }

    /**
     * Runs the extraction and writes the report files.
     *
     * @param args optional; {@code args[0]} is the zip archive, directory or
     *             file to parse. Without it the built-in default path is used.
     */
    public static void main(String[] args) {
        String source = args != null && args.length > 0 ? args[0] : DEFAULT_ORIGIN_HTML;
        extractAndSave(source);
    }
}