Java tutorial
/** * * APDPlat - Application Product Development Platform Copyright (c) 2013, ??, * yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.superword.extract; import org.apache.commons.lang.StringUtils; import org.apdplat.superword.model.SynonymAntonym; import org.apdplat.superword.model.Word; import org.apdplat.superword.tools.HtmlFormatter; import org.apdplat.superword.tools.WordClassifier; import org.apdplat.superword.tools.WordSources; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.net.URL; import java.nio.file.*; import java.nio.file.attribute.BasicFileAttributes; import java.util.*; import java.util.stream.Collectors; /** * ???????? * @author ?? */ public class SynonymAntonymExtractor { private SynonymAntonymExtractor() { } private static final Logger LOGGER = LoggerFactory.getLogger(SynonymAntonymExtractor.class); private static final String SYNONYM_ANTONYM_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.simple div#dict_content_3.dict_content.tongyi div.industry_box div.industry"; private static final String TYPE = "h4"; private static final String WORDS = "ul dl dd a"; public static Set<SynonymAntonym> parse(String path) { if (path.endsWith(".zip")) { return parseZip(path); } if (Files.isDirectory(Paths.get(path))) { return parseDir(path); } else { return parseFile(path); } } public static Set<SynonymAntonym> parseDir(String dir) { Set<SynonymAntonym> data = new HashSet<>(); LOGGER.info("?" + dir); try { Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { data.addAll(parseFile(file.toFile().getAbsolutePath())); return FileVisitResult.CONTINUE; } }); } catch (IOException e) { LOGGER.error("?", e); } return data; } public static Set<SynonymAntonym> parseZip(String zipFile) { Set<SynonymAntonym> data = new HashSet<>(); LOGGER.info("?ZIP" + zipFile); try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) { for (Path path : fs.getRootDirectories()) { LOGGER.info("?" + path); Files.walkFileTree(path, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { LOGGER.info("?" + file); // ? Path temp = Paths.get("target/origin-html-temp.txt"); Files.copy(file, temp, StandardCopyOption.REPLACE_EXISTING); data.addAll(parseFile(temp.toFile().getAbsolutePath())); return FileVisitResult.CONTINUE; } }); } } catch (Exception e) { LOGGER.error("?", e); } return data; } public static Set<SynonymAntonym> parseFile(String file) { Set<SynonymAntonym> data = new HashSet<>(); LOGGER.info("?" + file); try (BufferedReader reader = new BufferedReader( new InputStreamReader(new BufferedInputStream(new FileInputStream(file))))) { String line = null; while ((line = reader.readLine()) != null) { //LOGGER.debug("html:"+line); String[] attr = line.split("??"); if (attr == null || attr.length != 2) { LOGGER.error( "?'??'???????" + line); continue; } String word = attr[0]; LOGGER.info("???" + word); String html = attr[1]; SynonymAntonym sa = parseSynonymAntonym(html, word); if (sa.valid()) { data.add(sa); } } } catch (IOException e) { LOGGER.error("?", e); } return data; } /** * ???? * @param html * @return */ public static SynonymAntonym parseSynonymAntonym(String html, String word) { SynonymAntonym synonymAntonym = new SynonymAntonym(); synonymAntonym.setWord(new Word(word, "")); try { for (Element element : Jsoup.parse(html).select(SYNONYM_ANTONYM_CSS_PATH)) { String type = element.select(TYPE).text().trim(); LOGGER.debug("type:" + type); Elements elements = element.select(WORDS); for (Element ele : elements) { String w = ele.text().trim(); LOGGER.debug("word:" + w); if (StringUtils.isNotBlank(w)) { switch (type) { case "??": synonymAntonym.addSynonym(new Word(w, "")); break; case "???": synonymAntonym.addAntonym(new Word(w, "")); break; default: LOGGER.error("???????" + type); } } else { LOGGER.error("??????" + word); } } } LOGGER.info("??????" + synonymAntonym); } catch (Exception e) { LOGGER.error("??????", e); } return synonymAntonym; } private static Set<SynonymAntonym> inSyllabusVocabulary(Set<SynonymAntonym> synonymAntonyms) { Set<Word> voc = WordSources.getSyllabusVocabulary(); return synonymAntonyms.stream().filter(sa -> voc.contains(sa.getWord())).collect(Collectors.toSet()); } private static Set<SynonymAntonym> notInSyllabusVocabulary(Set<SynonymAntonym> synonymAntonyms) { Set<Word> voc = WordSources.getSyllabusVocabulary(); return synonymAntonyms.stream().filter(sa -> !voc.contains(sa.getWord())).collect(Collectors.toSet()); } private static void parseSynonymAntonym() { Set<SynonymAntonym> synonymAntonyms = parse("/Users/apple/?/origin_html.zip"); String inSyllabusVocabularyHtml = HtmlFormatter .toHtmlForSynonymAntonym(inSyllabusVocabulary(synonymAntonyms), 6); String notInSyllabusVocabularyHtml = HtmlFormatter .toHtmlForSynonymAntonym(notInSyllabusVocabulary(synonymAntonyms), 6); try { Files.write(Paths.get("src/main/resources/synonym_antonym_in_syllabus_vocabulary.txt"), inSyllabusVocabularyHtml.getBytes("utf-8")); Files.write(Paths.get("src/main/resources/synonym_antonym_not_in_syllabus_vocabulary.txt"), notInSyllabusVocabularyHtml.getBytes("utf-8")); } catch (Exception e) { LOGGER.error(e.getMessage(), e); } } private static void parseAntonym() { Set<SynonymAntonym> antonyms = parse("/Users/apple/?/origin_html.zip"); String inSyllabusVocabularyHtml = HtmlFormatter.toHtmlForAntonym(inSyllabusVocabulary(antonyms), 6); String notInSyllabusVocabularyHtml = HtmlFormatter.toHtmlForAntonym(notInSyllabusVocabulary(antonyms), 6); try { Files.write(Paths.get("src/main/resources/antonym_in_syllabus_vocabulary.txt"), inSyllabusVocabularyHtml.getBytes("utf-8")); Files.write(Paths.get("src/main/resources/antonym_not_in_syllabus_vocabulary.txt"), notInSyllabusVocabularyHtml.getBytes("utf-8")); } catch (Exception e) { LOGGER.error(e.getMessage(), e); } } public static SynonymAntonym parseSynonymAntonym(String word) { try { return parseSynonymAntonym(Jsoup.parse(new URL("http://www.iciba.com/" + word), 15000).html(), word); } catch (Exception e) { LOGGER.error("??????", e); } return null; } public static void main(String[] args) { //parseSynonymAntonym("back"); //parseSynonymAntonym(); parseAntonym(); } }