// Java tutorial
/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, ??,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Static helpers that split English text into lower-case words, count word
 * frequencies over one or more {@code .txt} files, and derive reports from
 * those counts (frequency distribution, scored sentences, a dictionary file,
 * and a table of words found in a set of exam word lists).
 *
 * @author ??
 */
public class TextAnalyzer {

    private TextAnalyzer() {
    }

    /** Matches any run of digits; used to detect tokens such as "P2P" or "Neo4j". */
    private static final Pattern PATTERN = Pattern.compile("\\d+");
    /** Matches the remainder of a Unicode escape once the backslash is gone, e.g. "u4e2d". */
    private static final Pattern UNICODE = Pattern.compile("[uU][0-9a-fA-F]{4}");
    private static final Logger LOGGER = LoggerFactory.getLogger(TextAnalyzer.class);

    /**
     * Counts word frequencies across several text files.
     *
     * @param files paths of the text files to analyse
     * @return word to total number of occurrences over all files
     */
    public static Map<String, AtomicInteger> frequency(Collection<String> files) {
        Map<String, AtomicInteger> map = new ConcurrentHashMap<>();
        files.forEach(file -> {
            LOGGER.info("parse text file: " + file);
            // Count this file on its own, then fold its counts into the total.
            Map<String, AtomicInteger> data = frequency(file);
            data.forEach((word, count) ->
                    map.computeIfAbsent(word, k -> new AtomicInteger()).addAndGet(count.get()));
        });
        LOGGER.info("total unique words count: " + map.size());
        return map;
    }

    /**
     * Counts word frequencies in a single text file.
     *
     * @param file path of the text file
     * @return word to number of occurrences; an empty map when the file cannot be opened
     */
    public static Map<String, AtomicInteger> frequency(String file) {
        try {
            // The stream is closed by frequency(InputStream).
            return frequency(new FileInputStream(file));
        } catch (IOException e) {
            LOGGER.error("failed to open text file: " + file, e);
        }
        return Collections.emptyMap();
    }

    /**
     * Counts word frequencies in a stream of UTF-8 text. The stream is read line
     * by line, blank lines are skipped, and the stream is closed before returning.
     *
     * @param inputStream text to analyse
     * @return word to number of occurrences; holds whatever was counted before a read error
     */
    public static Map<String, AtomicInteger> frequency(InputStream inputStream) {
        Map<String, AtomicInteger> map = new ConcurrentHashMap<>();
        // Decode explicitly as UTF-8 so results do not depend on the platform default charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new BufferedInputStream(inputStream), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (StringUtils.isBlank(line)) {
                    continue;
                }
                for (String word : seg(line)) {
                    map.computeIfAbsent(word, k -> new AtomicInteger()).incrementAndGet();
                }
            }
        } catch (IOException ex) {
            LOGGER.error("failed to read text stream", ex);
        }
        LOGGER.info("unique words count: " + map.size());
        return map;
    }

    /**
     * Splits a sentence into lower-case words.
     * <p>
     * The sentence is cut on every character that is not an ASCII letter or digit.
     * Tokens shorter than two characters are dropped. The remaining tokens are
     * split at lower-to-upper case boundaries (camel case) unless they are treated
     * as a single word, and each part is passed through {@link #irregularity(String)}.
     *
     * @param sentence text to split; must not be {@code null}
     * @return the words in order of appearance, duplicates included
     */
    public static List<String> seg(String sentence) {
        List<String> data = new ArrayList<>();
        String[] words = sentence.trim().split("[^a-zA-Z0-9]");
        StringBuilder log = new StringBuilder();
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("??:" + sentence);
        }
        for (String word : words) {
            if (StringUtils.isBlank(word) || word.length() < 2) {
                continue;
            }
            String token = word;
            // Lower-casing up front disables camel-case splitting for this token.
            if (token.length() < 6
                    // first and last letter upper case, e.g. PostgreSQL
                    || (Character.isUpperCase(token.charAt(token.length() - 1))
                            && Character.isUpperCase(token.charAt(0)))
                    // contains digits, e.g. P2P, Neo4j
                    || PATTERN.matcher(token).find()
                    || StringUtils.isAllUpperCase(token)) {
                // Locale.ROOT keeps ASCII letters ASCII under every default locale.
                token = token.toLowerCase(Locale.ROOT);
            }
            for (String part : splitCamelCase(token)) {
                String w = part.toLowerCase(Locale.ROOT);
                if (w.length() < 2) {
                    continue;
                }
                w = irregularity(w);
                if (StringUtils.isNotBlank(w)) {
                    data.add(w);
                    if (LOGGER.isDebugEnabled()) {
                        log.append(w).append(" ");
                    }
                }
            }
        }
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("?" + log);
        }
        return data;
    }

    /**
     * Cuts a token in front of every upper-case letter that directly follows a
     * lower-case letter, e.g. "firstName" becomes "first" and "Name".
     */
    private static List<String> splitCamelCase(String word) {
        List<String> parts = new ArrayList<>();
        int last = 0;
        for (int i = 1; i < word.length(); i++) {
            if (Character.isUpperCase(word.charAt(i)) && Character.isLowerCase(word.charAt(i - 1))) {
                parts.add(word.substring(last, i));
                last = i;
            }
        }
        if (last < word.length()) {
            parts.add(word.substring(last));
        }
        return parts;
    }

    /**
     * Filters out tokens that are not words and maps a few word fragments to full words.
     *
     * @param word lower-case token of at least two characters
     * @return the word to count, or {@code null} when the token must be ignored
     */
    private static String irregularity(String word) {
        // The specific numeric forms are tested before the generic leading-digit
        // check; otherwise they could never be reached. All three return null.
        if (word.startsWith("0x") || word.startsWith("0X")) {
            LOGGER.debug("?16" + word);
            return null;
        }
        if (word.length() > 1
                && word.endsWith("l")
                && StringUtils.isNumeric(word.substring(0, word.length() - 1))) {
            LOGGER.debug("?long" + word);
            return null;
        }
        if (Character.isDigit(word.charAt(0))) {
            LOGGER.debug("?" + word);
            return null;
        }
        // Anchored at the start of the token: the backslash of an escape is a split
        // boundary, so the escape remainder always begins the token. An unanchored
        // search would also discard ordinary words such as "succeed" ("u" + "ccee").
        if (UNICODE.matcher(word).lookingAt()) {
            LOGGER.debug("?UNICODE?" + word);
            return null;
        }
        switch (word) {
            // I'll do it. You'll see.
            case "ll":
                return "will";
            // If you're already building applications using Spring.
            case "re":
                return "are";
            // package com.manning.sdmia.ch04;
            case "ch":
                return "chapter";
            // you find you've made a
            case "ve":
                return "have";
            // but it doesn't stop there.
            case "doesn":
                return "does";
            // but it isn't enough.
            case "isn":
                return "is";
            // <input type="text" name="firstName" /><br/>
            case "br":
                return null;
        }
        return word;
    }

    /**
     * Groups words by how often they occur.
     *
     * @param data word to number of occurrences
     * @return number of occurrences to a {@link Stat} holding how many words have
     *         that count together with a capped sample of those words
     */
    public static Map<Integer, Stat> distribute(Map<String, AtomicInteger> data) {
        Map<Integer, Stat> stat = new HashMap<>();
        data.forEach((word, count) -> {
            Stat bucket = stat.computeIfAbsent(count.get(), k -> new Stat());
            bucket.increment();
            bucket.addWords(word);
        });
        return stat;
    }

    /**
     * Analyses the text file or directory at {@code path} and writes the HTML
     * fragment produced by {@code HtmlFormatter.toHtmlFragmentForText} to
     * {@code target/words_<name>.txt}, where {@code <name>} is the last path
     * element with ".txt" removed.
     *
     * @param path a text file or a directory containing text files
     */
    public static void parse(String path) {
        Set<String> fileNames = getFileNames(path);
        Map<String, AtomicInteger> data = frequency(fileNames);
        String htmlFragment = HtmlFormatter.toHtmlFragmentForText(data, fileNames);
        try {
            String resultFile = "target/words_" + Paths.get(path).toFile().getName().replace(".txt", "") + ".txt";
            Files.write(Paths.get(resultFile), htmlFragment.getBytes(StandardCharsets.UTF_8));
            LOGGER.info("" + resultFile);
        } catch (IOException e) {
            LOGGER.error("failed to write analysis result for: " + path, e);
        }
    }

    /**
     * Resolves a path to the set of text files it denotes.
     *
     * @param path a file or a directory
     * @return {@code path} itself when it is not a directory; otherwise the absolute
     *         paths of all files below it whose name ends with ".txt" and does not
     *         start with "."
     */
    public static Set<String> getFileNames(String path) {
        Set<String> fileNames = new HashSet<>();
        if (Files.isDirectory(Paths.get(path))) {
            LOGGER.info("?" + path);
        } else {
            LOGGER.info("?" + path);
            fileNames.add(path);
            return fileNames;
        }
        try {
            Files.walkFileTree(Paths.get(path), new SimpleFileVisitor<Path>() {

                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    // Skip hidden files.
                    if (file.toFile().getName().startsWith(".")) {
                        return FileVisitResult.CONTINUE;
                    }
                    String fileName = file.toFile().getAbsolutePath();
                    if (!fileName.endsWith(".txt")) {
                        LOGGER.info("??txt" + fileName);
                        return FileVisitResult.CONTINUE;
                    }
                    fileNames.add(fileName);
                    return FileVisitResult.CONTINUE;
                }
            });
        } catch (IOException e) {
            LOGGER.error("failed to list text files under: " + path, e);
        }
        return fileNames;
    }

    /**
     * Scores every non-blank line of the given text(s) and keeps the {@code limit}
     * lines at one end of the score range.
     * <p>
     * A line's score is the sum of {@code 1 / frequency} over its words, where the
     * frequency is the word's count across all analysed files. Only the first line
     * seen for a given score is kept. Each kept line is suffixed with the source
     * file name (without ".txt") wrapped in {@code <u><i>…</i></u>}.
     *
     * @param path   a text file or a directory containing text files
     * @param limit  maximum number of lines to return
     * @param isTopN {@code true} to keep the highest scores (lowest are evicted),
     *               {@code false} to keep the lowest scores (highest are evicted)
     * @return score to line, in ascending score order
     */
    public static TreeMap<Float, String> sentence(String path, int limit, boolean isTopN) {
        Set<String> fileNames = getFileNames(path);
        Map<String, AtomicInteger> frequency = frequency(fileNames);
        TreeMap<Float, String> sentences = new TreeMap<>();
        for (String fileName : fileNames) {
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new BufferedInputStream(new FileInputStream(fileName)),
                            StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (StringUtils.isBlank(line)) {
                        continue;
                    }
                    float score = rarityScore(line, frequency);
                    // One line per distinct score; lines without a positive score are skipped.
                    if (!(score > 0) || sentences.containsKey(score)) {
                        continue;
                    }
                    sentences.put(score, line + " <u><i>"
                            + Paths.get(fileName).toFile().getName().replace(".txt", "") + "</i></u>");
                    // Evict only once the map exceeds the limit, so exactly `limit`
                    // lines survive (evicting at size == limit kept limit - 1).
                    if (sentences.size() > limit) {
                        if (isTopN) {
                            // drop the lowest score
                            sentences.pollFirstEntry();
                        } else {
                            // drop the highest score
                            sentences.pollLastEntry();
                        }
                    }
                }
            } catch (IOException ex) {
                LOGGER.error("??", ex);
            }
        }
        return sentences;
    }

    /**
     * Sums {@code 1 / frequency} over the words of a line. Returns 0 (and logs an
     * error) as soon as a word has no positive count in {@code frequency}.
     */
    private static float rarityScore(String line, Map<String, AtomicInteger> frequency) {
        float score = 0;
        for (String word : seg(line)) {
            AtomicInteger fre = frequency.get(word);
            if (fre == null || fre.get() == 0) {
                LOGGER.error("????" + line);
                return 0;
            }
            score += 1 / (float) fre.get();
        }
        return score;
    }

    /**
     * Writes a dictionary file with one {@code count<TAB>word} line per word, most
     * frequent first. Only purely alphabetic words shorter than 12 characters are
     * included.
     *
     * @param textPath a text file or a directory containing text files
     * @param dicPath  path of the dictionary file to write
     */
    public static void toDic(String textPath, String dicPath) {
        Map<String, AtomicInteger> data = frequency(getFileNames(textPath));
        List<String> words = data.entrySet().stream()
                .filter(w -> StringUtils.isAlpha(w.getKey()) && w.getKey().length() < 12)
                // Integer.compare avoids the overflow risk of subtracting counts.
                .sorted((a, b) -> Integer.compare(b.getValue().get(), a.getValue().get()))
                .map(e -> e.getValue() + "\t" + e.getKey())
                .collect(Collectors.toList());
        try {
            Files.write(Paths.get(dicPath), words);
        } catch (IOException e) {
            LOGGER.error("??", e);
        }
    }

    /**
     * Reports which words of the analysed text also appear in the bundled word
     * lists (CET4, CET6, GRE, IELTS, TOEFL and one further list).
     *
     * @param textPath a text file or a directory containing text files
     * @return the HTML table fragment built by {@code HtmlFormatter.toHtmlTableFragment}
     *         from the matching words and their counts
     */
    public static String importantWords(String textPath) {
        Set<Word> wordSet = WordSources.get("/word_CET4.txt",
                "/word_CET6.txt",
                "/word_GRE.txt",
                "/word_IELTS.txt",
                "/word_TOEFL.txt",
                "/word_ .txt");
        Map<Word, AtomicInteger> data = WordSources.convert(frequency(getFileNames(textPath)));
        Set<Map.Entry<Word, AtomicInteger>> entries = data.entrySet().stream()
                .filter(entry -> wordSet.contains(entry.getKey()))
                .collect(Collectors.toSet());
        return HtmlFormatter.toHtmlTableFragment(entries, 5);
    }

    public static void main(String[] args) throws Exception {
        // Other entry points that can be run from here:
        //parse("src/main/resources/it/spring/Spring in Action 4th Edition.txt");
        //parse("src/main/resources/it/spring");
        //parse("src/main/resources/it");
        //toDic("src/main/resources/it", "src/main/resources/word_it.txt");
        System.out.print(importantWords("src/main/resources/it"));
    }

    /**
     * Counter for the words sharing one frequency, plus a capped sample of them.
     */
    public static class Stat {

        private final AtomicInteger count = new AtomicInteger();
        private final List<String> words = new ArrayList<>();

        /** @return how many times {@link #increment()} has been called */
        public int count() {
            return count.get();
        }

        public void increment() {
            count.incrementAndGet();
        }

        /** @return the sample words collected so far (the live internal list) */
        public List<String> getWords() {
            return words;
        }

        /**
         * Adds a sample word unless 11 are already stored.
         * NOTE(review): the garbled original Javadoc of distribute() mentions "10";
         * confirm whether the cap was meant to be 10 rather than 11.
         */
        public void addWords(String word) {
            if (this.words.size() < 11) {
                this.words.add(word);
            }
        }
    }
}