// Java tutorial
/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, ??,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apdplat.superword.model.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**
 * Extracts plain text from PDF files and filters it down to a list of
 * well-formed English sentences, one per output line.
 *
 * <p>Pipeline, as implemented below: Tika extracts the raw text, the text is
 * split into lines, lines are merged into paragraphs (blank line = paragraph
 * end, trailing hyphen = word continued on the next line), paragraphs that
 * contain unexpected characters or look like program code are dropped, and the
 * remaining paragraphs are split on {@code '.'} into sentences that must pass
 * the heuristics in {@link #processSentence(String)}.
 *
 * <p>NOTE(review): every non-ASCII string literal in this file (log messages
 * and a few markers) was damaged by a bad character encoding and now shows up
 * as {@code "?"} or an empty string. Literals that only feed log output are
 * kept byte-for-byte; literals that drive logic have been restored as Unicode
 * escapes where the file itself shows which character was meant.
 *
 * @author ??
 */
public class PdfParser {

    private static final Logger LOGGER = LoggerFactory.getLogger(PdfParser.class);
    private static final AutoDetectParser PARSER = new AutoDetectParser();

    /** A sentence with fewer whitespace-separated words than this is rejected. */
    private static final int SENTENCE_WORD_MIN_COUNT = 10;
    /** A sentence containing a (non-URL) word longer than this is rejected. */
    private static final int MAX_WORD_CHAR_COUNT = 18;
    /** Maximum share of words that may start with an upper-case letter. */
    private static final float SENTENCE_CAP_WORD_MAX_RATE = 0.4f;
    /** Maximum share of segmented words that may be missing from the dictionary. */
    private static final double SENTENCE_NOT_WORD_MAX_RATE = 0.4;

    /**
     * Characters that mark a word broken across two lines: ASCII hyphen-minus
     * plus the six dash code points U+2010 to U+2015. The non-ASCII entries are
     * written as Unicode escapes so they survive re-encoding of this file.
     */
    private static final String[] LINE_END_HYPHENS = {
        "-", "\u2010", "\u2011", "\u2012", "\u2013", "\u2014", "\u2015"
    };
    /** Left (opening) double quotation mark, U+201C. */
    private static final char LEFT_DOUBLE_QUOTE = '\u201C';
    /** Right (closing) double quotation mark, U+201D. */
    private static final char RIGHT_DOUBLE_QUOTE = '\u201D';

    /**
     * Marker that temporarily replaces a dot which does not end a sentence
     * (written by {@link #prepareSeg(String)}, turned back into {@code "."} by
     * {@link #segSentence(String)}).
     * NOTE(review): the value is kept exactly as found; it is probably a
     * mis-encoded two-character marker, and as it stands a literal "??" in the
     * PDF text will also be turned into a dot. Confirm against the upstream
     * source before changing it, because the marker characters take part in
     * the word heuristics of processSentence.
     */
    private static final String INNER_DOT_PLACEHOLDER = "??";

    /** Strings stripped from a word before it is tested for being purely alphabetic. */
    private static final Set<String> PUNCTUATION = new HashSet<>();
    /** Every character that caused a paragraph to be rejected; reported by showSentenceWordLengthInfo. */
    private static final Set<Character> CORRUPT_CHAR = new HashSet<>();
    private static final Set<Word> DICTIONARY = WordSources.getAll();

    static {
        PUNCTUATION.addAll(Arrays.asList(
                ",", "?", "-", ":", ";", "/", "+", "=", "==", "%", "!",
                "'", "\"", "[", "]", "(", ")"));
        // The dash and curly-quote entries had been reduced to "" / "?" by the
        // encoding damage; they are restored from the same characters used by
        // the hyphen-joining and quote-balancing code in this class.
        PUNCTUATION.addAll(Arrays.asList(LINE_END_HYPHENS));
        PUNCTUATION.add(String.valueOf(LEFT_DOUBLE_QUOTE));
        PUNCTUATION.add(String.valueOf(RIGHT_DOUBLE_QUOTE));
        // NOTE(review): one more garbled entry directly after "," could not be
        // identified (possibly the right single quotation mark U+2019) and is
        // therefore not restored here - confirm against the upstream source.
    }

    /** Histogram: words per accepted sentence -> number of sentences (filled only when debug logging is on). */
    private static final Map<Integer, AtomicInteger> SENTENCE_LENGTH_INFO = new ConcurrentHashMap<>();

    private PdfParser() {
    }

    /**
     * Extracts the body text of a document using Tika's auto-detecting parser.
     *
     * @param file path of the document to read
     * @return the extracted text, or an empty string if reading or parsing failed
     */
    public static String parsePdfFileToPlainText(String file) {
        try (InputStream stream = new FileInputStream(file)) {
            BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
            Metadata metadata = new Metadata();
            PARSER.parse(stream, handler, metadata);
            return handler.toString();
        } catch (Exception e) {
            LOGGER.error("Failed to extract text from " + file, e);
        }
        return "";
    }

    /**
     * Convenience overload of {@link #parseDirectory(Path)}.
     *
     * @param dir directory to walk
     */
    public static void parseDirectory(String dir) {
        parseDirectory(Paths.get(dir));
    }

    /**
     * Walks a directory tree, converts every file accepted by
     * {@link #parseFile(Path)} and writes the resulting target names, one per
     * line, to {@code src/main/resources/it/manifest}.
     *
     * @param dir directory to walk
     */
    public static void parseDirectory(Path dir) {
        try {
            long start = System.currentTimeMillis();
            LOGGER.info("?" + dir);
            List<String> fileNames = new ArrayList<>();
            Files.walkFileTree(dir, new SimpleFileVisitor<Path>() {

                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    String fileName = parseFile(file);
                    // parseFile yields null for skipped files and "" for failures
                    if (StringUtils.isNotBlank(fileName)) {
                        fileNames.add(fileName);
                    }
                    return FileVisitResult.CONTINUE;
                }

            });
            Files.write(Paths.get("src/main/resources/it/manifest"), fileNames);
            long cost = System.currentTimeMillis() - start;
            LOGGER.info("?" + cost + "");
        } catch (IOException e) {
            LOGGER.error("Failed to parse directory " + dir, e);
        }
    }

    /**
     * Opens a zip archive as a file system and runs {@link #parseFile(String)}
     * on every entry.
     *
     * <p>Each entry is first copied to the fixed temporary file
     * {@code target/it-software-domain-temp.pdf}, so every entry is parsed
     * under that same name (and therefore maps to the same output file).
     *
     * @param zipFile path of the zip archive
     */
    public static void parseZip(String zipFile) {
        long start = System.currentTimeMillis();
        LOGGER.info("?ZIP" + zipFile);
        try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) {
            for (Path path : fs.getRootDirectories()) {
                LOGGER.info("?" + path);
                Files.walkFileTree(path, new SimpleFileVisitor<Path>() {

                    @Override
                    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                        LOGGER.info("?" + file);
                        // Copy the entry out of the archive: parseFile goes through
                        // Path.toFile() and FileInputStream, which need a regular file.
                        Path temp = Paths.get("target/it-software-domain-temp.pdf");
                        Files.copy(file, temp, StandardCopyOption.REPLACE_EXISTING);
                        parseFile(temp.toFile().getAbsolutePath());
                        return FileVisitResult.CONTINUE;
                    }

                });
            }
        } catch (Exception e) {
            LOGGER.error("?", e);
        }
        long cost = System.currentTimeMillis() - start;
        LOGGER.info("?" + cost + "");
    }

    /**
     * Convenience overload of {@link #parseFile(Path)}.
     *
     * @param file path of the PDF file
     * @return see {@link #parseFile(Path)}
     */
    public static String parseFile(String file) {
        return parseFile(Paths.get(file));
    }

    /**
     * Converts one PDF file into a text file of accepted sentences.
     *
     * @param file the PDF file
     * @return {@code null} if the file is skipped (hidden or not {@code .pdf});
     *         an empty string if anything fails; otherwise the target file name
     *         with the {@code src/main/resources} prefix removed
     */
    public static String parseFile(Path file) {
        try {
            if (invalid(file)) {
                return null;
            }
            String sourceName = file.toFile().getAbsolutePath();
            String targetName = prepareTarget(file);
            LOGGER.info("?" + sourceName);
            LOGGER.info("?" + targetName);
            if (targetName == null) {
                // prepareTarget has already logged the cause; do not waste time
                // parsing a PDF whose output cannot be written.
                return "";
            }
            String text = parsePdfFileToPlainText(sourceName);
            List<String> sentences = toSentence(text);
            Files.write(Paths.get(targetName), sentences);
            return targetName.replace("src/main/resources", "");
        } catch (Exception e) {
            LOGGER.error("Failed to parse file " + file, e);
        }
        return "";
    }

    /**
     * Tells whether a line ends the current paragraph.
     *
     * @param line a line of extracted text
     * @return {@code true} if the line is null, empty or whitespace only
     */
    public static boolean paragraphFinish(String line) {
        return StringUtils.isBlank(line);
    }

    /**
     * Turns raw extracted text into the list of accepted sentences.
     *
     * @param text raw text; {@code null} yields an empty list
     * @return accepted sentences in document order
     */
    private static List<String> toSentence(String text) {
        List<String> data = new ArrayList<>();
        if (text == null) {
            return data;
        }
        StringBuilder paragraph = new StringBuilder();
        String[] lines = text.split("[\n\r]");
        for (int i = 0; i < lines.length; i++) {
            String line = lines[i].trim();
            // a blank line closes the paragraph collected so far
            if (paragraphFinish(line)) {
                process(paragraph.toString().trim(), data);
                paragraph.setLength(0);
            }
            LOGGER.debug("PDF" + (i + 1) + "" + line);
            // A trailing hyphen means the word continues on the next line:
            // drop the hyphen and glue the next non-blank line on. Blank lines
            // are consumed without changing the current line.
            while (endsWithLineBreakHyphen(line)) {
                LOGGER.debug("?");
                if ((i + 1) < lines.length) {
                    String pre = line.substring(0, line.length() - 1);
                    String n = lines[i + 1].trim();
                    if (StringUtils.isNotBlank(n)) {
                        LOGGER.debug("");
                        line = pre + n;
                    }
                    LOGGER.debug("PDF" + (i + 2) + "" + lines[i + 1]);
                    i++;
                } else {
                    LOGGER.debug("");
                    break;
                }
            }
            // neighbours of the (possibly advanced) current index
            String lastLine = null;
            String nextLine = null;
            if (i - 1 > -1) {
                lastLine = lines[i - 1].trim();
            }
            if (i + 1 < lines.length) {
                nextLine = lines[i + 1].trim();
            }
            addLineToParagraph(line, lastLine, nextLine, paragraph);
        }
        // flush whatever is left after the last line
        process(paragraph.toString(), data);
        return data;
    }

    /** Returns true if the line ends with one of {@link #LINE_END_HYPHENS}. */
    private static boolean endsWithLineBreakHyphen(String line) {
        for (String hyphen : LINE_END_HYPHENS) {
            if (line.endsWith(hyphen)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends a line (followed by one space) to the paragraph buffer unless it
     * is blank or looks like a numbered heading.
     *
     * <p>A line is treated as a heading when it starts with a digit, ends with
     * a letter, and the following line is blank or starts with a digit or an
     * upper-case letter. The check is skipped when there is no following line.
     *
     * @param line      current line, already trimmed
     * @param lastLine  previous line or {@code null}; currently unused
     * @param nextLine  following line or {@code null}
     * @param paragraph buffer collecting the current paragraph
     */
    private static void addLineToParagraph(String line, String lastLine, String nextLine, StringBuilder paragraph) {
        if (StringUtils.isBlank(line)) {
            return;
        }
        if (nextLine != null) {
            boolean numberedTitle = Character.isDigit(line.charAt(0))
                    && Character.isAlphabetic(line.charAt(line.length() - 1));
            boolean followedByNewBlock = StringUtils.isBlank(nextLine)
                    || Character.isDigit(nextLine.charAt(0))
                    || Character.isUpperCase(nextLine.charAt(0));
            if (numberedTitle && followedByNewBlock) {
                LOGGER.debug("???" + line);
                return;
            }
        }
        paragraph.append(line).append(" ");
    }

    /**
     * Heuristic test for paragraphs that are source code or markup rather than prose.
     *
     * @param paragraph paragraph text
     * @return {@code true} if the paragraph starts with a typical Java keyword
     *         or {@code /**}, contains {@code ");"}, a curly brace, or starts with {@code "<"}
     */
    public static boolean isProgramCode(String paragraph) {
        // Java source
        return paragraph.startsWith("package")
                || paragraph.startsWith("import")
                || paragraph.startsWith("public")
                || paragraph.startsWith("private")
                || paragraph.startsWith("/**")
                || paragraph.contains(");")
                || paragraph.contains("}")
                || paragraph.contains("{")
                // HTML / XML
                || paragraph.startsWith("<");
    }

    /**
     * Validates a paragraph and, if valid, appends its accepted sentences to {@code data}.
     *
     * @param paragraph paragraph text; blank input is ignored
     * @param data      list receiving the accepted sentences
     */
    private static void process(String paragraph, List<String> data) {
        if (StringUtils.isBlank(paragraph)) {
            return;
        }
        LOGGER.debug("?" + paragraph);
        if (!paragraphValid(paragraph)) {
            return;
        }
        List<String> sentences = segSentence(paragraph);
        if (!sentences.isEmpty()) {
            data.addAll(sentences);
        }
    }

    /**
     * Checks that a paragraph consists only of expected characters and is not
     * program code.
     *
     * <p>Accepted characters are printable ASCII (32..126), the general
     * punctuation range 8208..8231 (U+2010..U+2027: dashes, curly quotes,
     * bullets, ellipsis) and the Latin ligatures 64256..64262 (U+FB00..U+FB06).
     * The first character that falls outside these ranges is remembered in
     * {@code CORRUPT_CHAR} and the paragraph is rejected.
     *
     * @param paragraph paragraph text
     * @return {@code true} if the paragraph may be split into sentences
     */
    public static boolean paragraphValid(String paragraph) {
        char[] chars = paragraph.toCharArray();
        // NOTE(review): the scan starts at index 1, so the first character is
        // never checked. Kept as found - it may be a deliberate allowance for a
        // leading bullet glyph or an off-by-one; confirm before changing.
        for (int i = 1; i < chars.length; i++) {
            char c = chars[i];
            boolean generalPunctuation = c >= 8208 && c <= 8231;
            boolean printableAscii = c >= 32 && c <= 126;
            boolean latinLigature = c >= 64256 && c <= 64262;
            if (generalPunctuation || printableAscii || latinLigature) {
                continue;
            }
            CORRUPT_CHAR.add(c);
            LOGGER.debug("??" + c + "=" + (int) c + "" + i + "???" + paragraph);
            return false;
        }
        if (isProgramCode(paragraph)) {
            LOGGER.debug("?????" + paragraph);
            return false;
        }
        return true;
    }

    /**
     * Splits a paragraph into sentences on {@code '.'} and keeps those accepted
     * by {@link #processSentence(String)}.
     *
     * <p>When debug logging is enabled, the word count of every accepted
     * sentence is added to the sentence-length histogram.
     *
     * @param paragraph paragraph text
     * @return accepted sentences, possibly empty
     */
    private static List<String> segSentence(String paragraph) {
        List<String> data = new ArrayList<>();
        // protect dots that do not end a sentence before splitting
        paragraph = prepareSeg(paragraph);
        if (StringUtils.isBlank(paragraph)) {
            return data;
        }
        for (String s : paragraph.split("[.]")) {
            if (StringUtils.isBlank(s)) {
                continue;
            }
            LOGGER.debug("???" + s);
            s = processSentence(s);
            if (s == null) {
                continue;
            }
            // the split removed the full stop; put it back after a trailing letter
            if (Character.isAlphabetic(s.charAt(s.length() - 1))) {
                s += ".";
            }
            // restore the protected inner dots
            s = s.replace(INNER_DOT_PLACEHOLDER, ".");
            data.add(s);
            LOGGER.debug("??" + s);
            if (LOGGER.isDebugEnabled()) {
                int length = s.split("\\s+").length;
                SENTENCE_LENGTH_INFO.computeIfAbsent(length, k -> new AtomicInteger()).incrementAndGet();
            }
        }
        return data;
    }

    /**
     * Applies the sentence heuristics to one candidate and returns the cleaned
     * sentence or {@code null} if it is rejected.
     *
     * <p>Checks, in order: not blank; does not end with a comma; leading
     * non-letters are stripped and at least one letter remains; first letter is
     * upper case; a one-letter first word must be "A" or "I"; a longer first
     * word must not be all upper case; at least {@code SENTENCE_WORD_MIN_COUNT}
     * words; last word not numeric; share of capitalised words and longest
     * (non-URL) word within limits; number of words that are not purely
     * alphabetic after stripping punctuation at most {@code ln(wordCount) / 2};
     * share of segmented words missing from the dictionary within limits;
     * brackets and quotes balanced.
     *
     * @param sentence candidate sentence
     * @return the trimmed sentence without leading non-letters, or {@code null} if rejected
     */
    public static String processSentence(String sentence) {
        if (StringUtils.isBlank(sentence)) {
            LOGGER.debug("??" + sentence);
            return null;
        }
        sentence = sentence.trim();
        if (sentence.endsWith(",")) {
            LOGGER.debug("????" + sentence);
            return null;
        }
        // index of the first alphabetic character
        int i = 0;
        for (char c : sentence.toCharArray()) {
            if (Character.isAlphabetic(c)) {
                break;
            }
            i++;
        }
        if (i >= sentence.length()) {
            LOGGER.debug("???" + sentence);
            return null;
        }
        if (i > 0) {
            sentence = sentence.substring(i);
        }
        if (StringUtils.isBlank(sentence)) {
            LOGGER.debug("??" + sentence);
            return null;
        }
        if (!Character.isUpperCase(sentence.charAt(0))) {
            LOGGER.debug("????" + sentence);
            return null;
        }
        String[] words = sentence.split("\\s+");
        if (words[0].length() == 1 && !"A".equals(words[0]) && !"I".equals(words[0])) {
            LOGGER.debug("??????" + sentence);
            return null;
        }
        if (words[0].length() > 1 && StringUtils.isAllUpperCase(words[0])) {
            LOGGER.debug("????" + sentence);
            return null;
        }
        if (words.length < SENTENCE_WORD_MIN_COUNT) {
            LOGGER.debug("?" + SENTENCE_WORD_MIN_COUNT + "??" + sentence);
            return null;
        }
        if (StringUtils.isNumeric(words[words.length - 1])) {
            LOGGER.debug("???" + words[words.length - 1] + "??" + sentence);
            return null;
        }
        // capitalised-word share and longest word (URLs are exempt from the length limit)
        int capWordCount = 0;
        int maxWordCharCount = 0;
        for (String word : words) {
            if (Character.isUpperCase(word.charAt(0))) {
                capWordCount++;
            }
            if (!word.contains("http://") && word.length() > maxWordCharCount) {
                maxWordCharCount = word.length();
            }
        }
        if (capWordCount > words.length * SENTENCE_CAP_WORD_MAX_RATE) {
            LOGGER.debug("???" + capWordCount + "" + words.length * SENTENCE_CAP_WORD_MAX_RATE + "??" + sentence);
            return null;
        }
        if (maxWordCharCount > MAX_WORD_CHAR_COUNT) {
            LOGGER.debug("??????" + maxWordCharCount + "" + MAX_WORD_CHAR_COUNT + "??" + sentence);
            return null;
        }
        // words that still contain non-letters once punctuation is stripped
        int specialWordCount = 0;
        for (String word : words) {
            for (String c : PUNCTUATION) {
                word = word.replace(c, "");
            }
            if (StringUtils.isNotBlank(word) && !StringUtils.isAlpha(word)) {
                LOGGER.debug("????" + word);
                specialWordCount++;
            }
        }
        if (specialWordCount > Math.log(words.length) / 2) {
            LOGGER.debug("" + words.length + "????" + specialWordCount + "" + Math.log(words.length) / 2 + "??" + sentence);
            return null;
        }
        // distinct segmented words that are not in the dictionary
        int notWordCount = 0;
        Set<String> toCheck = TextAnalyzer.seg(sentence).stream().collect(Collectors.toSet());
        LOGGER.debug("???" + toCheck.size());
        for (String word : toCheck) {
            // Locale.ROOT keeps the lookup key independent of the default locale
            if (!DICTIONARY.contains(new Word(word.toLowerCase(Locale.ROOT), ""))) {
                LOGGER.debug("??" + word);
                notWordCount++;
            }
        }
        LOGGER.debug("??" + notWordCount);
        if (notWordCount > toCheck.size() * SENTENCE_NOT_WORD_MAX_RATE) {
            LOGGER.debug("????" + notWordCount + "" + toCheck.size() * SENTENCE_NOT_WORD_MAX_RATE + "??" + sentence);
            return null;
        }
        if (!bracketsAndQuotesBalanced(sentence)) {
            return null;
        }
        return sentence;
    }

    /**
     * Checks that opening and closing brackets/curly quotes occur equally often
     * and that straight double quotes occur an even number of times. Only the
     * totals are compared; nesting and bracket kind are not checked.
     */
    private static boolean bracketsAndQuotesBalanced(String sentence) {
        int pre = 0;
        int suf = 0;
        int quotCount = 0;
        char[] chars = sentence.toCharArray();
        for (int j = 0; j < chars.length; j++) {
            char c = chars[j];
            switch (c) {
                case '[':
                case '(':
                case LEFT_DOUBLE_QUOTE:
                    LOGGER.debug("?" + c + "" + j);
                    pre++;
                    break;
                case ']':
                case ')':
                case RIGHT_DOUBLE_QUOTE:
                    LOGGER.debug("?" + c + "" + j);
                    suf++;
                    break;
                case '"':
                    LOGGER.debug("?" + c + "" + j);
                    quotCount++;
                    break;
                default:
                    break;
            }
        }
        if (pre != suf) {
            LOGGER.debug("[]()???" + pre + "??" + suf);
            return false;
        }
        if (quotCount % 2 == 1) {
            LOGGER.debug("[]()???" + quotCount);
            return false;
        }
        return true;
    }

    /**
     * Prepares a paragraph for splitting on {@code '.'}.
     *
     * <p>{@code ".)"} and {@code "!)"} are rewritten to {@code ". "}. After
     * that, a dot counts as a sentence end only if it is followed by whitespace
     * or is the last character of the trimmed paragraph; every other dot is
     * replaced by {@code INNER_DOT_PLACEHOLDER} so that the later split leaves
     * it alone.
     *
     * @param paragraph paragraph text
     * @return the paragraph with inner dots protected
     */
    private static String prepareSeg(String paragraph) {
        paragraph = paragraph.replace(".)", ". ");
        paragraph = paragraph.replace("!)", ". ");
        if (!paragraph.contains(".")) {
            return paragraph;
        }
        paragraph = paragraph.trim();
        StringBuilder data = new StringBuilder();
        int index = 0;
        int last = 0;
        boolean replaced = false;
        while ((index = paragraph.indexOf(".", index)) > -1) {
            boolean lastChar = index == paragraph.length() - 1;
            boolean beforeWhitespace = index + 1 < paragraph.length()
                    && Character.isWhitespace(paragraph.charAt(index + 1));
            if (lastChar || beforeWhitespace) {
                // real sentence end: copy the text including the dot
                data.append(paragraph, last, index + 1);
            } else {
                // inner dot: copy the text and substitute the marker
                data.append(paragraph, last, index).append(INNER_DOT_PLACEHOLDER);
                replaced = true;
            }
            index++;
            last = index;
        }
        if (last < paragraph.length()) {
            data.append(paragraph, last, paragraph.length());
        }
        paragraph = data.toString();
        if (replaced) {
            LOGGER.debug(".??" + paragraph);
        }
        return paragraph;
    }

    /**
     * Tells whether a file must be skipped.
     *
     * @param file candidate file
     * @return {@code true} if the file name starts with {@code "."} or the
     *         absolute path does not end with {@code ".pdf"} (case-sensitive)
     */
    private static boolean invalid(Path file) {
        if (file.toFile().getName().startsWith(".")) {
            return true;
        }
        String fileName = file.toFile().getAbsolutePath();
        if (!fileName.endsWith(".pdf")) {
            LOGGER.info("??PDF" + fileName);
            return true;
        }
        return false;
    }

    /**
     * Computes the output file name for a PDF and prepares the location:
     * an existing output file is deleted and missing parent directories are
     * created.
     *
     * <p>The name is {@code "src/main/resources/it"} followed by the file's
     * absolute path with the grandparent directory's absolute path and every
     * {@code ".pdf"} removed, followed by {@code ".txt"}.
     *
     * @param file the PDF file
     * @return the target file name, or {@code null} if preparation failed
     */
    private static String prepareTarget(Path file) {
        try {
            String fileName = file.toFile().getAbsolutePath();
            String grandParent = file.getParent().getParent().toFile().getAbsolutePath();
            String targetName = "src/main/resources/it"
                    + fileName.replace(grandParent, "").replace(".pdf", "")
                    + ".txt";
            Path target = Paths.get(targetName);
            // start from a clean output file
            Files.deleteIfExists(target);
            if (Files.notExists(target.getParent())) {
                Files.createDirectories(target.getParent());
            }
            return targetName;
        } catch (Exception e) {
            LOGGER.error("Failed to prepare target file for " + file, e);
        }
        return null;
    }

    /** Clears the sentence-length histogram. */
    public static void resetSentenceWordLengthInfo() {
        SENTENCE_LENGTH_INFO.clear();
    }

    /**
     * Writes the sentence-length histogram (sorted by length) and the set of
     * rejected characters (sorted, as {@code codePoint=char}) to the debug log.
     */
    public static void showSentenceWordLengthInfo() {
        LOGGER.debug("???");
        SENTENCE_LENGTH_INFO.keySet().stream().sorted()
                .forEach(k -> LOGGER.debug(k + " -> " + SENTENCE_LENGTH_INFO.get(k)));
        LOGGER.debug("");
        CORRUPT_CHAR.stream().sorted().forEach(c -> LOGGER.debug((int) c + "=" + c.toString()));
    }

    /**
     * Parses a zip archive of documents and prints the collected statistics.
     *
     * <p>A single file or a whole directory can be processed instead by calling
     * {@link #parseFile(String)} or {@link #parseDirectory(String)}.
     * The source notes that it-software-domain.zip is available from
     * http://pan.baidu.com/s/1kT1NA3l
     *
     * @param args optional: {@code args[0]} is the path of the zip archive;
     *             the original hard-coded path is used when it is absent
     * @throws Exception declared for compatibility; nothing in the body throws a checked exception
     */
    public static void main(String[] args) throws Exception {
        resetSentenceWordLengthInfo();
        String zipFile = args.length > 0 ? args[0] : "/Users/apple/?/???/it-software-domain.zip";
        parseZip(zipFile);
        showSentenceWordLengthInfo();
    }
}