Java tutorial
/* * APDPlat - Application Product Development Platform * Copyright (c) 2013, ??, yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.apdplat.superword.tools; import org.apache.commons.lang.StringUtils; import org.apdplat.superword.model.Word; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.nio.file.FileSystem; import java.nio.file.*; import java.nio.file.attribute.BasicFileAttributes; import java.util.Collections; import java.util.List; import java.util.Random; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; /** * * ?????? * ???33376???HTML?origin_html_oxford.zip * ?http://pan.baidu.com/s/1pJmwr95 * @author ?? */ public class WordClassifierForOxford { private WordClassifierForOxford() { } private static final Logger LOGGER = LoggerFactory.getLogger(WordClassifierForOxford.class); private static final String OXFORD = WordLinker.OXFORD; private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; private static final String ENCODING = "gzip, deflate"; private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; private static final String CONNECTION = "keep-alive"; private static final String HOST = "www.oxforddictionaries.com"; private static final String REFERER = "http://www.oxforddictionaries.com/"; private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0"; private static final Set<String> NOT_FOUND_WORDS = Collections.newSetFromMap(new ConcurrentHashMap<>()); private static final Set<String> ORIGIN_HTML = Collections.newSetFromMap(new ConcurrentHashMap<>()); private static final AtomicInteger COUNT = new AtomicInteger(); public static void download(Set<Word> words) { LOGGER.debug("??" + words.size()); AtomicInteger i = new AtomicInteger(); words.parallelStream().forEach(word -> { if (i.get() % 1000 == 999) { save(); } showStatus(i.incrementAndGet(), words.size(), word.getWord()); String html = getContent(word.getWord()); //LOGGER.debug("?HTML" +html); if (StringUtils.isNotBlank(html)) { html = word.getWord() + "??" + html; parseHtml(html); if (!NOT_FOUND_WORDS.contains(word.getWord())) { ORIGIN_HTML.add(html); } } else { NOT_FOUND_WORDS.add(word.getWord()); } }); //? save(); LOGGER.debug("??" + words.size()); } public static void parse(String path) { if (path.endsWith(".zip")) { parseZip(path); } else if (Files.isDirectory(Paths.get(path))) { parseDir(path); } else { parseFile(path); } } public static void parseDir(String dir) { LOGGER.info("?" + dir); try { Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { parseFile(file.toFile().getAbsolutePath()); return FileVisitResult.CONTINUE; } }); } catch (IOException e) { LOGGER.error("?", e); } } public static void parseZip(String zipFile) { LOGGER.info("?ZIP" + zipFile); try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifierForOxford.class.getClassLoader())) { for (Path path : fs.getRootDirectories()) { LOGGER.info("?" + path); Files.walkFileTree(path, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { LOGGER.info("?" + file); // ? Path temp = Paths.get("target/origin-html-temp.txt"); Files.copy(file, temp, StandardCopyOption.REPLACE_EXISTING); parseFile(temp.toFile().getAbsolutePath()); return FileVisitResult.CONTINUE; } }); } } catch (Exception e) { LOGGER.error("?", e); } } public static void parseFile(String file) { LOGGER.info("?" + file); try (BufferedReader reader = new BufferedReader( new InputStreamReader(new BufferedInputStream(new FileInputStream(file))))) { String line = null; while ((line = reader.readLine()) != null) { parseHtml(line); } } catch (IOException e) { LOGGER.error("?", e); } } public static void parseHtml(String html) { LOGGER.debug("html:" + html); String[] attr = html.split("??"); if (attr == null || attr.length != 2) { LOGGER.error( "?'??'???????" + html); return; } String word = attr[0]; String htm = attr[1]; parse(word, htm); } public static void showStatus(int current, int total, String word) { LOGGER.debug("?? " + current + "/" + total + " ? " + current / (float) total * 100 + "% " + word); } public static synchronized void save() { LOGGER.info("??"); try { if (!NOT_FOUND_WORDS.isEmpty()) { String path = "src/main/resources/word_not_found.txt"; LOGGER.info("??" + path); AtomicInteger i = new AtomicInteger(); //NOT_FOUND_WORDS List<String> list = NOT_FOUND_WORDS.stream().sorted().map(word -> i.incrementAndGet() + "\t" + word) .collect(Collectors.toList()); Files.write(Paths.get(path), list); list.clear(); } //?HTML if (!ORIGIN_HTML.isEmpty()) { String path = "src/main/resources/origin_html_" + System.currentTimeMillis() + ".txt"; LOGGER.info("??" + path); Files.write(Paths.get(path), ORIGIN_HTML); ORIGIN_HTML.clear(); } } catch (Exception e) { LOGGER.error("??", e); } } public static String getContent(String word) { String url = OXFORD + word + "?renovate=" + (new Random(System.currentTimeMillis()).nextInt(899999) + 100000); LOGGER.debug("url:" + url); Connection conn = Jsoup.connect(url).header("Accept", ACCEPT).header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE).header("Connection", CONNECTION).header("Referer", REFERER) .header("Host", HOST).header("User-Agent", USER_AGENT).timeout(60000).ignoreContentType(true); String html = ""; try { html = conn.post().html(); html = html.replaceAll("[\n\r]", ""); } catch (Exception e) { //LOGGER.error("?URL"+url+"?", e); LOGGER.error("?URL" + url + "?"); } return html; } public static void parse(String word, String html) { String wordDefinition = MySQLUtils.getWordDefinition(word, WordLinker.Dictionary.OXFORD.name()); if (StringUtils.isNotBlank(wordDefinition)) { return; } List<String> list = Definition.parseDefinitionFromHtml(html, null, word, WordLinker.Dictionary.OXFORD); if (!list.isEmpty()) { LOGGER.info(COUNT.incrementAndGet() + "?????" + word); list.stream().forEach(System.out::println); MySQLUtils.saveWordDefinition(word, WordLinker.Dictionary.OXFORD.name(), Definition.concat(list, "<br/>")); } else { NOT_FOUND_WORDS.add(word); } } public static void main(String[] args) { //Set<Word> words = new HashSet<>(); //words.add(new Word("time", "")); //words.add(new Word("yangshangchuan", "")); //download(words); //download(WordSources.getAll()); //parse("src/main/resources/origin_html_1449054801557.txt"); //origin_html_oxford.zip?33376????HTML??http://pan.baidu.com/s/1pJmwr95 parse("/Users/apple/?/origin_html_oxford.zip"); } }