// Java tutorial
/** * * APDPlat - Application Product Development Platform Copyright (c) 2013, ??, * yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.superword.tools; import org.apache.commons.lang.StringUtils; import org.apdplat.superword.model.Word; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.nio.file.*; import java.nio.file.attribute.BasicFileAttributes; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; /** * ???? * ?????? * ?????61821???HTML?origin_html.zip * ?http://pan.baidu.com/s/1bnD9gy7 * @author ?? 
*/ public class WordClassifier { private WordClassifier() { } private static final Logger LOGGER = LoggerFactory.getLogger(WordClassifier.class); private static final String ICIBA = "http://www.iciba.com/"; private static final String TYPE_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.dictbar div.wd_genre a"; private static final String UNFOUND_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div#question.question.unfound_tips"; private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; private static final String ENCODING = "gzip, deflate"; private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; private static final String CONNECTION = "keep-alive"; private static final String HOST = "www.iciba.com"; private static final String REFERER = "http://www.iciba.com/"; private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0"; private static final Set<String> NOT_FOUND_WORDS = new HashSet<>(); private static final Set<String> ORIGIN_HTML = new HashSet<>(); public static void classify(Set<Word> words) { LOGGER.debug("??" + words.size()); AtomicInteger i = new AtomicInteger(); Map<String, List<String>> data = new HashMap<>(); words.forEach(word -> { if (i.get() % 1000 == 999) { save(data); } showStatus(data, i.incrementAndGet(), words.size(), word.getWord()); String html = getContent(word.getWord()); //LOGGER.debug("?HTML" +html); while (html.contains("??ip?")) { //IP? DynamicIp.toNewIp(); html = getContent(word.getWord()); } if (StringUtils.isNotBlank(html)) { parse(word.getWord(), html, data); if (!NOT_FOUND_WORDS.contains(word.getWord())) { ORIGIN_HTML.add(word.getWord() + "??" + html); } } else { NOT_FOUND_WORDS.add(word.getWord()); } }); //? save(data); LOGGER.debug("??" 
+ words.size()); } public static void parse(String path) { if (path.endsWith(".zip")) { parseZip(path); } if (Files.isDirectory(Paths.get(path))) { parseDir(path); } else { parseFile(path); } } public static void parseDir(String dir) { LOGGER.info("?" + dir); try { Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { parseFile(file.toFile().getAbsolutePath()); return FileVisitResult.CONTINUE; } }); } catch (IOException e) { LOGGER.error("?", e); } } public static void parseZip(String zipFile) { LOGGER.info("?ZIP" + zipFile); try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) { for (Path path : fs.getRootDirectories()) { LOGGER.info("?" + path); Files.walkFileTree(path, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { LOGGER.info("?" + file); // ? Path temp = Paths.get("target/origin-html-temp.txt"); Files.copy(file, temp, StandardCopyOption.REPLACE_EXISTING); parseFile(temp.toFile().getAbsolutePath()); return FileVisitResult.CONTINUE; } }); } } catch (Exception e) { LOGGER.error("?", e); } } public static void parseFile(String file) { LOGGER.info("?" + file); try (BufferedReader reader = new BufferedReader( new InputStreamReader(new BufferedInputStream(new FileInputStream(file))))) { Map<String, List<String>> data = new HashMap<>(); String line = null; while ((line = reader.readLine()) != null) { parse(line, data); } save(data); } catch (IOException e) { LOGGER.error("?", e); } } public static void parse(String html, Map<String, List<String>> data) { LOGGER.debug("html:" + html); String[] attr = html.split("??"); if (attr == null || attr.length != 2) { LOGGER.error( "?'??'???????" + html); return; } String word = attr[0]; LOGGER.info("???" 
+ word); String htm = attr[1]; parse(word, htm, data); } public static void showStatus(Map<String, List<String>> data, int current, int total, String word) { LOGGER.debug("?? " + current + "/" + total + " ? " + current / (float) total * 100 + "% " + word); data.entrySet().forEach(e -> { LOGGER.debug(e.getKey() + "\t" + e.getValue().size()); }); } public static void save(Map<String, List<String>> data) { LOGGER.info("??"); data.keySet().forEach(key -> { try { String path = "src/main/resources/word_" + key + ".txt"; LOGGER.error("??" + path); List<String> existWords = Files.readAllLines(Paths.get(path)); Set<String> allWords = new HashSet<>(); existWords.forEach(line -> { String[] attr = line.split("\\s+"); if (attr != null) { String w = ""; if (attr.length == 1) { w = attr[0]; } if (attr.length == 2) { w = attr[1]; } allWords.add(w); } }); allWords.addAll(data.get(key)); AtomicInteger i = new AtomicInteger(); List<String> list = allWords.stream().sorted().map(word -> i.incrementAndGet() + "\t" + word) .collect(Collectors.toList()); Files.write(Paths.get(path), list); data.get(key).clear(); existWords.clear(); allWords.clear(); list.clear(); } catch (Exception e) { LOGGER.error("??", e); } }); data.clear(); try { if (!NOT_FOUND_WORDS.isEmpty()) { String path = "src/main/resources/word_not_found.txt"; LOGGER.error("??" + path); AtomicInteger i = new AtomicInteger(); //NOT_FOUND_WORDS List<String> list = NOT_FOUND_WORDS.stream().sorted().map(word -> i.incrementAndGet() + "\t" + word) .collect(Collectors.toList()); Files.write(Paths.get(path), list); list.clear(); } //?HTML if (!ORIGIN_HTML.isEmpty()) { String path = "src/main/resources/origin_html_" + System.currentTimeMillis() + ".txt"; LOGGER.error("??" 
+ path); Files.write(Paths.get(path), ORIGIN_HTML); ORIGIN_HTML.clear(); } } catch (Exception e) { LOGGER.error("??", e); } } public static String getContent(String word) { String url = ICIBA + word + "?renovate=" + (new Random(System.currentTimeMillis()).nextInt(899999) + 100000); LOGGER.debug("url:" + url); Connection conn = Jsoup.connect(url).header("Accept", ACCEPT).header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE).header("Connection", CONNECTION).header("Referer", REFERER) .header("Host", HOST).header("User-Agent", USER_AGENT).ignoreContentType(true); String html = ""; try { html = conn.post().html(); html = html.replaceAll("[\n\r]", ""); } catch (Exception e) { LOGGER.error("?URL" + url + "?", e); } return html; } public static void parse(String word, String html, Map<String, List<String>> data) { Document doc = Jsoup.parse(html); Elements es = doc.select(TYPE_CSS_PATH); for (Element e : es) { String type = e.text(); LOGGER.debug("?" + type); if (StringUtils.isNotBlank(type)) { data.putIfAbsent(type, new ArrayList<>()); data.get(type).add(word); } } es = doc.select(UNFOUND_CSS_PATH); for (Element e : es) { String notFound = e.text(); LOGGER.debug("?" + notFound); if (StringUtils.isNotBlank(notFound) && (notFound.contains("?") || notFound.contains("??"))) { NOT_FOUND_WORDS.add(word); } } } public static void main(String[] args) { //Set<Word> words = new HashSet<>(); //words.add(new Word("time", "")); //words.add(new Word("yangshangchuan", "")); //classify(words); //classify(WordSources.getAll()); //parse("src/main/resources/origin_html_1427060576977.txt"); //origin_html.zip?61821????HTML??http://pan.baidu.com/s/1bnD9gy7 parse("/Users/apple/?/origin_html.zip"); } }