Java tutorial
/** * * APDPlat - Application Product Development Platform Copyright (c) 2013, ??, * yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.superword.rule; import org.apache.commons.lang.StringUtils; import org.apdplat.superword.tools.WordLinker; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.nio.file.*; import java.nio.file.attribute.BasicFileAttributes; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; /** * ? * * @author ?? */ public class TextAnalysis { private TextAnalysis() { } private static final Logger LOGGER = LoggerFactory.getLogger(TextAnalysis.class); /** * @param files ? * @return ?? */ public static Map<String, AtomicInteger> frequency(Collection<String> files) { Map<String, AtomicInteger> map = new ConcurrentHashMap<>(); for (String file : files) { LOGGER.info("parse text file: " + file); //? Map<String, AtomicInteger> data = frequency(file); //? data.entrySet().forEach(entry -> { map.putIfAbsent(entry.getKey(), new AtomicInteger()); map.get(entry.getKey()).addAndGet(entry.getValue().get()); }); data.clear(); } LOGGER.info("total unique words count: " + map.size()); return map; } public static Map<String, AtomicInteger> frequency(String file) { try { return frequency(new FileInputStream(file)); } catch (IOException e) { e.printStackTrace(); } return Collections.emptyMap(); } public static Map<String, AtomicInteger> frequency(InputStream inputStream) { Map<String, AtomicInteger> map = new ConcurrentHashMap<>(); try (BufferedReader reader = new BufferedReader( new InputStreamReader(new BufferedInputStream(inputStream)))) { String line = null; while ((line = reader.readLine()) != null) { if (StringUtils.isBlank(line)) { continue; } List<String> words = seg(line); words.forEach(word -> { map.putIfAbsent(word, new AtomicInteger()); map.get(word).incrementAndGet(); }); words.clear(); } } catch (IOException ex) { ex.printStackTrace(); } LOGGER.info("unique words count: " + map.size()); return map; } public static List<String> seg(String sentence) { return seg(sentence, false); } /** * ? * @param sentence * @param debug ??? * @return */ public static List<String> seg(String sentence, boolean debug) { List<String> data = new ArrayList<>(); //?? String[] words = sentence.trim().split("[^a-zA-Z]"); StringBuilder log = new StringBuilder(); if (LOGGER.isDebugEnabled()) { LOGGER.debug("??:" + sentence); } for (String word : words) { if (StringUtils.isBlank(word)) { continue; } List<String> list = new ArrayList<String>(); //?6??? if (word.length() < 6 || StringUtils.isAllUpperCase(word)) { word = word.toLowerCase(); } //??? int last = 0; for (int i = 1; i < word.length(); i++) { if (Character.isUpperCase(word.charAt(i)) && Character.isLowerCase(word.charAt(i - 1))) { list.add(word.substring(last, i)); last = i; } } if (last < word.length()) { list.add(word.substring(last, word.length())); } list.stream().map(w -> w.toLowerCase()).forEach(w -> { if (w.length() < 2) { return; } w = irregularity(w); if (StringUtils.isNotBlank(w)) { data.add(w); if (LOGGER.isDebugEnabled()) { log.append(w).append(" "); } } }); } LOGGER.debug("?" + log); return data; } /** * ???? * @param word * @return */ private static String irregularity(String word) { switch (word) { //Ill do it. You'll see. case "ll": return "will"; //If youre already building applications using Spring. case "re": return "are"; //package com.manning.sdmia.ch04; case "ch": return "chapter"; //you find youve made a case "ve": return "have"; //but it doesnt stop there. case "doesn": return "does"; //but it isnt enough. case "isn": return "is"; //<input type="text" name="firstName" /><br/> case "br": return null; } return word; } /** * {? : ?} ?{? : ??10?} * @param data ? * @return ? */ public static Map<Integer, Stat> distribute(Map<String, AtomicInteger> data) { Map<Integer, Stat> stat = new HashMap<>(); data.entrySet().forEach(entry -> { Integer key = entry.getValue().get(); stat.putIfAbsent(key, new Stat()); stat.get(key).increment(); stat.get(key).addWords(entry.getKey()); }); return stat; } public static String toHtmlFragment(Map<String, AtomicInteger> data, Set<String> fileNames) { StringBuilder html = new StringBuilder(); html.append("?<br/>\n"); AtomicInteger i = new AtomicInteger(); fileNames.stream().sorted().forEach(fileName -> html.append(i.incrementAndGet()).append("?") .append(Paths.get(fileName).toFile().getName().replace(".txt", "")).append("<br/>\n")); Map<Integer, Stat> stat = distribute(data); html.append("").append(data.size()).append("??<br/>\n").append( "<table border=\"1\" bordercolor=\"#00CCCC\" width=\"850\">\n\t<tr><td>??</td><td></td><td>??</td><td>??</td></tr>\n"); AtomicInteger k = new AtomicInteger(); stat.keySet().stream().sorted((a, b) -> b - a).forEach(s -> { html.append("\t<tr><td>").append(k.incrementAndGet()).append("</td><td>").append(s).append("</td><td>") .append(stat.get(s).count()).append("</td><td>"); AtomicInteger z = new AtomicInteger(); List<String> list = stat.get(s).getWords(); list.stream().sorted().forEach(w -> { if (list.size() > 1) { html.append(z.incrementAndGet()).append(".").append(WordLinker.toLink(w)).append(" "); } else if (list.size() == 1) { html.append(WordLinker.toLink(w)); } }); html.append("</td></tr>\n"); }); html.append("</table>").append("\n(").append(data.size()).append(")??<br/>\n") .append("<table>\n\t<tr><td>??</td><td>??</td><td>?</td></tr>\n"); AtomicInteger wordCounter = new AtomicInteger(); data.entrySet().stream().filter(entry -> entry.getKey().length() <= 14) .sorted((a, b) -> b.getValue().get() - a.getValue().get()).forEach(entry -> { html.append("\t").append("<tr><td>").append(wordCounter.incrementAndGet()).append("</td><td>") .append(WordLinker.toLink(entry.getKey())).append("</td><td>") .append(entry.getValue().get()).append("</td></tr>\n"); }); html.append("</table>\n").append("14?") .append("\n<table>\n\t<tr><td>??</td><td>??</td><td>?</td></tr>\n"); AtomicInteger j = new AtomicInteger(); data.entrySet().stream().filter(entry -> entry.getKey().length() > 14) .sorted((a, b) -> b.getValue().get() - a.getValue().get()) .forEach(entry -> html.append("\t").append("<tr><td>").append(j.incrementAndGet()) .append("</td><td>").append(WordLinker.toLink(entry.getKey())).append("</td><td>") .append(entry.getValue().get()).append("</td></tr>\n")); html.append("</table>\n").append("2?") .append("\n<table>\n\t<tr><td>??</td><td>??</td><td>?</td></tr>\n"); AtomicInteger z = new AtomicInteger(); data.entrySet().stream().filter(entry -> entry.getKey().length() == 2) .sorted((a, b) -> b.getValue().get() - a.getValue().get()) .forEach(entry -> html.append("\t").append("<tr><td>").append(z.incrementAndGet()) .append("</td><td>").append(WordLinker.toLink(entry.getKey())).append("</td><td>") .append(entry.getValue().get()).append("</td></tr>\n")); html.append("</table>"); return html.toString(); } /** * ? * @param path */ public static void parse(String path) { //? Set<String> fileNames = getFileNames(path); //? Map<String, AtomicInteger> data = frequency(fileNames); // String htmlFragment = toHtmlFragment(data, fileNames); try { //? String resultFile = "target/words_" + Paths.get(path).toFile().getName().replace(".txt", "") + ".txt"; Files.write(Paths.get(resultFile), htmlFragment.getBytes("utf-8")); LOGGER.info("" + resultFile); } catch (IOException e) { e.printStackTrace(); } } public static Set<String> getFileNames(String path) { Set<String> fileNames = new HashSet<>(); if (Files.isDirectory(Paths.get(path))) { LOGGER.info("?" + path); } else { LOGGER.info("?" + path); fileNames.add(path); return fileNames; } try { Files.walkFileTree(Paths.get(path), new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { if (file.toFile().getName().startsWith(".")) { return FileVisitResult.CONTINUE; } String fileName = file.toFile().getAbsolutePath(); if (!fileName.endsWith(".txt")) { LOGGER.info("??txt" + fileName); return FileVisitResult.CONTINUE; } fileNames.add(fileName); return FileVisitResult.CONTINUE; } }); } catch (IOException e) { e.printStackTrace(); } return fileNames; } public static Map<String, List<String>> findEvidence(Path dir, List<String> words) { LOGGER.info("?" + dir); Map<String, List<String>> data = new HashMap<>(); try { Files.walkFileTree(dir, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { String fileName = file.toFile().getAbsolutePath(); if (file.toFile().getName().startsWith(".")) { return FileVisitResult.CONTINUE; } if (!fileName.endsWith(".txt")) { LOGGER.info("??txt" + fileName); return FileVisitResult.CONTINUE; } LOGGER.info("?" + fileName); List<String> lines = Files.readAllLines(file); for (int i = 0; i < lines.size(); i++) { final String line = lines.get(i); final int index = i; words.forEach(word -> { if (line.toLowerCase().contains(word)) { data.putIfAbsent(word, new ArrayList<>()); data.get(word).add(line + " <u><i>" + file.toFile().getName().replace(".txt", "") + "</i></u>"); } }); } return FileVisitResult.CONTINUE; } }); } catch (IOException e) { e.printStackTrace(); } return data; } public static String toHtmlFragment(Map<String, List<String>> data) { StringBuilder html = new StringBuilder(); AtomicInteger i = new AtomicInteger(); data.keySet().stream().forEach(word -> { StringBuilder p = new StringBuilder(); for (char c : word.toCharArray()) { p.append("[").append(Character.toUpperCase(c)).append(Character.toLowerCase(c)).append("]{1}"); } html.append(i.incrementAndGet()).append("??? ").append(WordLinker.toLink(word)) .append(" ?<br/>\n"); html.append("<ol>\n"); data.get(word) .forEach(t -> html.append("\t<li>") .append(t.replaceAll(p.toString(), "<font color=\"red\">" + word + "</font>")) .append("</li><br/>\n")); html.append("</ol><br/>\n"); }); return html.toString(); } public static void summary() { List<String> words = Arrays.asList("resurgent", "categorically", "misleadingly", "weightings", "uniques", "alphanumerics", "misspell", "conducive", "dissection", "marvel", "graciously", "inspections", "appetite", "visualizations", "commonalities", "dissecting", "fidelity", "creativity", "coyote", "reaction"); Map<String, List<String>> data = findEvidence(Paths.get("src/main/resources/it"), words); String html = toHtmlFragment(data); LOGGER.info(html); } public static void main(String[] args) throws Exception { //parse("src/main/resources/it/spring/Spring in Action 4th Edition.txt"); //parse("src/main/resources/it/spring"); parse("src/main/resources/it"); summary(); } private static class Stat { private AtomicInteger count = new AtomicInteger(); private List<String> words = new ArrayList<>(); public int count() { return count.get(); } public void increment() { count.incrementAndGet(); } public List<String> getWords() { return words; } public void addWords(String word) { if (this.words.size() < 11) { this.words.add(word); } } } }