org.apdplat.superword.extract.PhraseExtractor.java Source code

Introduction

Here is the source code for org.apdplat.superword.extract.PhraseExtractor.java
Source

/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.superword.extract;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.tools.HtmlFormatter;
import org.apdplat.superword.tools.WordLinker;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URL;
import java.nio.file.FileSystem;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Extracts English phrases from saved dictionary pages.
 * <p>
 * Input is text in which every line holds a word and the HTML of its page, joined by a
 * delimiter. The HTML is queried with a fixed CSS path and the text of each matching element
 * is kept when it looks like a phrase: fewer than 50 characters, at least two
 * whitespace-separated tokens, and only ASCII letters in every token.
 * <p>
 * I/O failures are logged rather than thrown, so callers receive whatever was collected
 * before the failure (possibly an empty set).
 *
 * @author Yang Shangchuan
 */
public class PhraseExtractor {

    private static final Logger LOGGER = LoggerFactory.getLogger(PhraseExtractor.class);

    /** CSS path of the elements whose text is treated as a phrase candidate. */
    private static final String PHRASE_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.simple div#dict_content_2.dict_content.word_group dl.def_list dd.dd_show h4.cx_mean_switch";

    /** Static utility class; not instantiable. */
    private PhraseExtractor() {
    }

    /**
     * Extracts phrases from {@code path}, choosing the reader by the kind of path: a name
     * ending in {@code .zip} is read as an archive, a directory is walked recursively, and
     * anything else is read as a single file.
     *
     * @param path location of a zip archive, a directory or a plain file
     * @return the phrases found; never {@code null}
     */
    public static Set<String> parse(String path) {
        if (path.endsWith(".zip")) {
            return parseZip(path);
        }
        if (Files.isDirectory(Paths.get(path))) {
            return parseDir(path);
        }
        return parseFile(path);
    }

    /**
     * Extracts phrases from every file below {@code dir}, recursively.
     *
     * @param dir directory to walk
     * @return the phrases found in all visited files; never {@code null}
     */
    public static Set<String> parseDir(String dir) {
        Set<String> data = new HashSet<>();
        LOGGER.info("?{}", dir);
        try {
            Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() {

                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    data.addAll(parseFile(file.toFile().getAbsolutePath()));
                    return FileVisitResult.CONTINUE;
                }

            });
        } catch (IOException e) {
            LOGGER.error("?", e);
        }
        return data;
    }

    /**
     * Extracts phrases from every entry of a zip archive.
     * <p>
     * Entries are streamed straight from the archive, so no temporary file is needed. An
     * I/O error on any entry stops the walk; the error is logged and the phrases collected
     * so far are returned.
     *
     * @param zipFile path of the zip archive
     * @return the phrases found in all entries read; never {@code null}
     */
    public static Set<String> parseZip(String zipFile) {
        Set<String> data = new HashSet<>();
        LOGGER.info("?ZIP{}", zipFile);
        try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile),
                PhraseExtractor.class.getClassLoader())) {
            for (Path root : fs.getRootDirectories()) {
                LOGGER.info("?{}", root);
                Files.walkFileTree(root, new SimpleFileVisitor<Path>() {

                    @Override
                    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                        LOGGER.info("?{}", file);
                        try (InputStream in = Files.newInputStream(file)) {
                            collectPhrases(in, data);
                        }
                        return FileVisitResult.CONTINUE;
                    }

                });
            }
        } catch (Exception e) {
            // Broad on purpose: opening the archive can also fail with unchecked exceptions.
            LOGGER.error("?", e);
        }
        return data;
    }

    /**
     * Extracts phrases from a single text file, one {@code word<delimiter>html} record per
     * line. The file is decoded as UTF-8.
     *
     * @param file path of the file to read
     * @return the phrases found; on an I/O error, those collected before it; never {@code null}
     */
    public static Set<String> parseFile(String file) {
        Set<String> data = new HashSet<>();
        LOGGER.info("?{}", file);
        try (InputStream in = new FileInputStream(file)) {
            collectPhrases(in, data);
        } catch (IOException e) {
            LOGGER.error("?", e);
        }
        return data;
    }

    /**
     * Reads {@code word<delimiter>html} lines from {@code in} and adds the phrases of each
     * well-formed line to {@code data}. Malformed lines are logged and skipped.
     */
    private static void collectPhrases(InputStream in, Set<String> data) throws IOException {
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "utf-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            // NOTE(review): the delimiter literal looks mis-encoded in this copy of the file;
            // as written, "??" is not a valid regular expression, so split throws
            // PatternSyntaxException - confirm the intended delimiter.
            String[] attr = line.split("??");
            if (attr.length != 2) {
                LOGGER.error("?'??'???????{}", line);
                continue;
            }
            data.addAll(parsePhrase(attr[1], attr[0]));
        }
    }

    /**
     * Extracts the phrases listed in the HTML page of {@code word}.
     * <p>
     * Nothing is extracted when {@code word} is {@code null}, empty, or starts with an
     * upper-case letter. Extraction stops at the first candidate that is not a valid phrase.
     *
     * @param html HTML of the page for {@code word}
     * @param word the word the page belongs to
     * @return the phrases found; never {@code null}
     */
    public static Set<String> parsePhrase(String html, String word) {
        Set<String> phrases = new HashSet<>();
        LOGGER.info("???{}", word);
        if (word == null || word.isEmpty()) {
            // Nothing to inspect; also protects the charAt(0) check below.
            return phrases;
        }
        if (Character.isUpperCase(word.charAt(0))) {
            LOGGER.info("???");
            return phrases;
        }
        try {
            for (Element element : Jsoup.parse(html).select(PHRASE_CSS_PATH)) {
                String phrase = element.text().trim();
                if (StringUtils.isBlank(phrase)) {
                    continue;
                }
                if (!isPhrase(phrase)) {
                    LOGGER.debug(":{}", phrase);
                    // NOTE(review): a rejected candidate ends the whole loop instead of being
                    // skipped - confirm that later entries are meant to be dropped.
                    break;
                }
                phrases.add(phrase);
                LOGGER.debug("?:{}", phrase);
            }
        } catch (Exception e) {
            LOGGER.error("?", e);
        }
        return phrases;
    }

    /**
     * Tells whether {@code text} is shorter than 50 characters, has at least two
     * whitespace-separated tokens and consists of ASCII letters only.
     */
    private static boolean isPhrase(String text) {
        if (text.length() >= 50) {
            return false;
        }
        String[] tokens = text.split("\\s+");
        if (tokens.length < 2) {
            return false;
        }
        for (String token : tokens) {
            for (char c : token.toCharArray()) {
                boolean asciiLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
                if (!asciiLetter) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Extracts phrases from the hard-coded archive, turns them into links, formats them as a
     * two-column HTML table fragment and writes the result to
     * {@code src/main/resources/phrases.txt} as UTF-8.
     */
    private static void parsePhrase() {
        Set<String> parses = parse("/Users/apple/?/origin_html.zip");
        List<String> ps = parses.stream().sorted().map(p -> WordLinker.toLink(p)).collect(Collectors.toList());
        String html = HtmlFormatter.toHtmlTableFragment(ps, 2);
        try {
            Files.write(Paths.get("src/main/resources/phrases.txt"), html.getBytes("utf-8"));
        } catch (Exception e) {
            LOGGER.error(e.getMessage(), e);
        }
    }

    /**
     * Downloads the page of {@code word} from iciba.com (15 second timeout) and extracts
     * its phrases.
     *
     * @param word the word to look up
     * @return the phrases found, or an empty set when the page cannot be fetched; never
     *         {@code null}
     */
    public static Set<String> parsePhrase(String word) {
        try {
            return parsePhrase(Jsoup.parse(new URL("http://www.iciba.com/" + word), 15000).html(), word);
        } catch (Exception e) {
            LOGGER.error("?", e);
        }
        // Return an empty set rather than null so callers can iterate without a null check.
        return new HashSet<>();
    }

    /**
     * Runs the batch extraction over the hard-coded archive.
     * For a single online lookup, call {@code parsePhrase("up")} instead.
     */
    public static void main(String[] args) {
        parsePhrase();
    }
}