org.apdplat.superword.tools.PdfParser.java Source code

Java tutorial

Introduction

Here is the source code for org.apdplat.superword.tools.PdfParser.java

Source

/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, Yang Shangchuan,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apdplat.superword.model.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**
 * Converts PDF documents into plain-text files containing one sentence per line (pdf to txt).
 * @author Yang Shangchuan
 */
public class PdfParser {
    // Private constructor: every member of this class is static.
    private PdfParser() {
    }

    private static final Logger LOGGER = LoggerFactory.getLogger(PdfParser.class);

    // Shared Tika parser used by parsePdfFileToPlainText.
    private static final AutoDetectParser PARSER = new AutoDetectParser();
    // processSentence rejects sentences with fewer whitespace-separated words than this.
    private static final int SENTENCE_WORD_MIN_COUNT = 10;
    // processSentence rejects sentences whose longest word (ignoring words containing "http://") exceeds this length.
    private static final int MAX_WORD_CHAR_COUNT = 18;
    // processSentence rejects sentences in which more than this fraction of the words start with an upper-case letter.
    private static final float SENTENCE_CAP_WORD_MAX_RATE = 0.4f;
    // Strings removed from each word by processSentence before it tests whether the word is purely alphabetic.
    private static final Set<String> punctuation = new HashSet<>();
    // Characters that caused paragraphValid to reject a paragraph; reported by showSentenceWordLengthInfo.
    private static final Set<Character> CORRUPT_CHAR = new HashSet<>();
    // Known words; processSentence looks up each segmented word (lower-cased) in this set.
    private static final Set<Word> DICTIONARY = WordSources.getAll();

    // Fills the punctuation set that processSentence strips from every word
    // before checking that the word is purely alphabetic.
    //
    // NOTE(review): the non-ASCII entries had been destroyed by an encoding
    // round-trip (they showed up as "" or "?"). They are restored here as
    // Unicode escapes so the source no longer depends on the file encoding:
    // right single quote U+2019, the dashes U+2010..U+2015 and the curly
    // double quotes U+201C/U+201D. The reconstruction follows the code-point
    // table in paragraphValid and the dash test in toSentence - please confirm
    // against the upstream file.
    static {
        Collections.addAll(punctuation,
                ",",
                "\u2019",
                "\u2010",
                "\u2011",
                "\u2012",
                "\u2013",
                "\u2014",
                "-",
                "\u2015",
                ":",
                ";",
                "/",
                "+",
                "=",
                "==",
                "%",
                "!",
                "'",
                "\"",
                "[",
                "]",
                "(",
                ")",
                "\u201C",
                "\u201D",
                "?");
    }

    // Histogram keyed by sentence word count; segSentence increments it only while debug logging is enabled.
    private static final Map<Integer, AtomicInteger> SENTENCE_LENGTH_INFO = new ConcurrentHashMap<>();

    /**
     * Extracts the textual content of a file with the shared Tika
     * {@link AutoDetectParser}.
     *
     * @param file path of the file to read
     * @return the text collected by the content handler, or an empty string
     *         when the file cannot be opened or parsed (the failure is logged)
     */
    public static String parsePdfFileToPlainText(String file) {
        try (InputStream stream = new FileInputStream(file)) {
            // Integer.MAX_VALUE is handed to the handler as its write limit so
            // that large documents are not cut off.
            BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
            PARSER.parse(stream, handler, new Metadata());
            return handler.toString();
        } catch (Exception e) {
            // Report through the class logger (as parseZip does) instead of
            // dumping the stack trace to stderr.
            LOGGER.error("Failed to extract text from " + file, e);
            return "";
        }
    }

    /**
     * Convenience overload of {@link #parseDirectory(Path)} that accepts the
     * directory as a string.
     *
     * @param dir directory to walk
     */
    public static void parseDirectory(String dir) {
        final Path root = Paths.get(dir);
        parseDirectory(root);
    }

    /**
     * Walks a directory tree, runs {@link #parseFile(Path)} on every file and
     * writes the non-blank results, one per line, to
     * {@code src/main/resources/it/manifest}.
     *
     * @param dir root directory to walk
     */
    public static void parseDirectory(Path dir) {
        try {
            long start = System.currentTimeMillis();
            LOGGER.info("?" + dir);
            List<String> fileNames = new ArrayList<>();
            Files.walkFileTree(dir, new SimpleFileVisitor<Path>() {

                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    // parseFile yields null for skipped files and "" on failure;
                    // neither belongs in the manifest.
                    String fileName = parseFile(file);
                    if (StringUtils.isNotBlank(fileName)) {
                        fileNames.add(fileName);
                    }
                    return FileVisitResult.CONTINUE;
                }

            });
            Files.write(Paths.get("src/main/resources/it/manifest"), fileNames);
            long cost = System.currentTimeMillis() - start;
            LOGGER.info("?" + cost + "");
        } catch (IOException e) {
            // Report through the class logger instead of printStackTrace().
            LOGGER.error("Failed to parse directory " + dir, e);
        }
    }

    /**
     * Opens an archive as a file system, walks every root directory in it and
     * parses each entry after copying it to a fixed temporary file on the
     * local file system.
     *
     * @param zipFile path of the archive to process
     */
    public static void parseZip(String zipFile) {
        final long begin = System.currentTimeMillis();
        LOGGER.info("?ZIP" + zipFile);
        // Stateless visitor: each entry is copied out of the archive and the
        // local copy is handed to parseFile.
        final SimpleFileVisitor<Path> entryVisitor = new SimpleFileVisitor<Path>() {

            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                LOGGER.info("?" + file);
                Path localCopy = Paths.get("target/it-software-domain-temp.pdf");
                Files.copy(file, localCopy, StandardCopyOption.REPLACE_EXISTING);
                parseFile(localCopy.toFile().getAbsolutePath());
                return FileVisitResult.CONTINUE;
            }

        };
        try (FileSystem archive = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) {
            for (Path root : archive.getRootDirectories()) {
                LOGGER.info("?" + root);
                Files.walkFileTree(root, entryVisitor);
            }
        } catch (Exception e) {
            LOGGER.error("?", e);
        }
        final long elapsed = System.currentTimeMillis() - begin;
        LOGGER.info("?" + elapsed + "");
    }

    /**
     * Convenience overload of {@link #parseFile(Path)} that accepts the file
     * location as a string.
     *
     * @param file location of the file to parse
     * @return whatever {@link #parseFile(Path)} returns for that location
     */
    public static String parseFile(String file) {
        final Path location = Paths.get(file);
        return parseFile(location);
    }

    /**
     * Extracts the text of one PDF, splits it into sentences and writes them,
     * one per line, to the target file chosen by {@code prepareTarget}.
     *
     * @param file the file to parse
     * @return {@code null} when the file is skipped by {@code invalid};
     *         an empty string when anything fails; otherwise the target file
     *         name with the {@code src/main/resources} prefix removed
     */
    public static String parseFile(Path file) {
        try {
            if (invalid(file)) {
                return null;
            }
            String sourceName = file.toFile().getAbsolutePath();
            String targetName = prepareTarget(file);
            LOGGER.info("?" + sourceName);
            LOGGER.info("?" + targetName);
            if (targetName == null) {
                // prepareTarget signals failure with null; stop here instead of
                // parsing the whole document and then failing on Paths.get(null).
                LOGGER.error("No target file could be prepared for " + sourceName);
                return "";
            }
            // Extract the raw text.
            String text = parsePdfFileToPlainText(sourceName);
            // Rebuild paragraphs and cut them into sentences.
            List<String> sentences = toSentence(text);
            // Persist the sentences.
            Files.write(Paths.get(targetName), sentences);
            return targetName.replace("src/main/resources", "");
        } catch (Exception e) {
            // Report through the class logger instead of printStackTrace().
            LOGGER.error("Failed to parse file " + file, e);
        }
        return "";
    }

    /**
     * Tells whether a line marks the end of a paragraph; a blank line does.
     *
     * @param line the line to inspect
     * @return {@code true} when the line is blank
     */
    public static boolean paragraphFinish(String line) {
        return StringUtils.isBlank(line);
    }

    /**
     * Rebuilds paragraphs from raw extracted text and turns them into
     * sentences. A blank line ends the current paragraph; a line that ends
     * with a hyphen or dash is joined with the following line.
     *
     * @param text raw text
     * @return accepted sentences in document order
     */
    private static List<String> toSentence(String text) {
        List<String> data = new ArrayList<>();
        StringBuilder paragraph = new StringBuilder();
        // CRLF counts as a single line break; splitting on each of CR and LF
        // separately would create a phantom blank line (and therefore a
        // paragraph break) after every line of CRLF-terminated text.
        String[] lines = text.split("\r\n|[\n\r]");
        for (int i = 0; i < lines.length; i++) {
            String line = lines[i].trim();
            // A blank line closes the paragraph collected so far.
            if (paragraphFinish(line)) {
                process(paragraph.toString().trim(), data);
                paragraph.setLength(0);
            }
            LOGGER.debug("PDF" + (i + 1) + "" + line);
            // Join words that were hyphenated across a line break.
            while (endsWithDash(line)) {
                LOGGER.debug("?");
                if ((i + 1) >= lines.length) {
                    LOGGER.debug("");
                    break;
                }
                // Current line without its trailing dash.
                String pre = line.substring(0, line.length() - 1);
                String n = lines[i + 1].trim();
                if (StringUtils.isNotBlank(n)) {
                    LOGGER.debug("");
                    line = pre + n;
                }
                LOGGER.debug("PDF" + (i + 2) + "" + lines[i + 1]);
                // The following line is consumed either way.
                i++;
            }
            String lastLine = null;
            String nextLine = null;
            if (i - 1 > -1) {
                lastLine = lines[i - 1].trim();
            }
            if (i + 1 < lines.length) {
                nextLine = lines[i + 1].trim();
            }
            addLineToParagraph(line, lastLine, nextLine, paragraph);
        }
        // Flush whatever is left after the last line.
        process(paragraph.toString(), data);
        return data;
    }

    /**
     * Tells whether a line ends with an ASCII hyphen or one of the dashes
     * U+2010 to U+2015.
     *
     * NOTE(review): the original dash literals were lost to an encoding
     * round-trip (one had become the empty string, which made the test always
     * true). The range is reconstructed from the seven-way test that stood
     * here and the code-point table in paragraphValid - please confirm.
     */
    private static boolean endsWithDash(String line) {
        if (line.isEmpty()) {
            return false;
        }
        char last = line.charAt(line.length() - 1);
        return last == '-' || (last >= '\u2010' && last <= '\u2015');
    }

    /**
     * Appends a line, followed by a space, to the paragraph buffer unless the
     * line is blank or is skipped by the rule below.
     *
     * A line is skipped when it starts with a digit, ends with a letter, and
     * the next line is blank or starts with a digit or an upper-case letter
     * (presumably numbered headings).
     *
     * @param line      current line
     * @param lastLine  previous line, or null; not consulted by this method
     * @param nextLine  following line, or null when there is none
     * @param paragraph buffer receiving the line
     */
    private static void addLineToParagraph(String line, String lastLine, String nextLine, StringBuilder paragraph) {
        if (StringUtils.isBlank(line)) {
            return;
        }
        if (nextLine != null) {
            final char head = line.charAt(0);
            final char tail = line.charAt(line.length() - 1);
            final boolean digitToLetter = Character.isDigit(head) && Character.isAlphabetic(tail);
            if (digitToLetter) {
                // The blank test comes first so charAt(0) is never called on an empty string.
                final boolean nextStartsFresh = StringUtils.isBlank(nextLine)
                        || Character.isDigit(nextLine.charAt(0))
                        || Character.isUpperCase(nextLine.charAt(0));
                if (nextStartsFresh) {
                    LOGGER.debug("???" + line);
                    return;
                }
            }
        }
        paragraph.append(line).append(" ");
    }

    /**
     * Heuristic test for paragraphs that look like source code or markup.
     *
     * @param paragraph text to inspect
     * @return {@code true} when the text starts with {@code package},
     *         {@code import}, {@code public}, {@code private}, a Javadoc
     *         opener or {@code <}, or contains {@code );} or a curly brace
     */
    public static boolean isProgramCode(String paragraph) {
        // Java-like openings.
        final boolean javaPrefix = paragraph.startsWith("package")
                || paragraph.startsWith("import")
                || paragraph.startsWith("public")
                || paragraph.startsWith("private")
                || paragraph.startsWith("/**");
        if (javaPrefix) {
            return true;
        }
        // Statement terminators and braces anywhere in the text.
        final boolean codePunctuation = paragraph.contains(");")
                || paragraph.contains("}")
                || paragraph.contains("{");
        if (codePunctuation) {
            return true;
        }
        // HTML / XML.
        return paragraph.startsWith("<");
    }

    /**
     * Validates one paragraph and, when it passes, appends its sentences to
     * the result list.
     *
     * @param paragraph paragraph text; blank input is ignored
     * @param data      list receiving the accepted sentences
     */
    private static void process(String paragraph, List<String> data) {
        if (StringUtils.isBlank(paragraph)) {
            return;
        }
        LOGGER.debug("?" + paragraph);
        if (!paragraphValid(paragraph)) {
            return;
        }
        final List<String> accepted = segSentence(paragraph);
        if (accepted.isEmpty()) {
            return;
        }
        data.addAll(accepted);
    }

    /**
     * Accepts a paragraph only when its characters fall into the allowed
     * ranges and it does not look like program code.
     *
     * Allowed ranges (decimal code points in parentheses):
     * <ul>
     * <li>U+2010..U+2027 (8208..8231): hyphens, dashes, curly quotes, daggers,
     *     bullets, ellipsis and neighbours in the General Punctuation block</li>
     * <li>U+0020..U+007E (32..126): printable ASCII</li>
     * <li>U+FB00..U+FB06 (64256..64262): Latin ligatures</li>
     * </ul>
     * The first offending character is recorded in CORRUPT_CHAR.
     *
     * @param paragraph text to check
     * @return {@code true} when the paragraph may be split into sentences
     */
    public static boolean paragraphValid(String paragraph) {
        final int length = paragraph.length();
        // NOTE(review): the scan starts at index 1, so the very first character
        // is never inspected; unclear whether that is intentional.
        for (int pos = 1; pos < length; pos++) {
            final char ch = paragraph.charAt(pos);
            final boolean generalPunctuation = ch >= 0x2010 && ch <= 0x2027;
            final boolean printableAscii = ch >= 0x20 && ch <= 0x7E;
            final boolean latinLigature = ch >= 0xFB00 && ch <= 0xFB06;
            if (generalPunctuation || printableAscii || latinLigature) {
                continue;
            }
            CORRUPT_CHAR.add(ch);
            LOGGER.debug("??" + ch + "=" + (int) ch + "" + pos
                    + "???" + paragraph);
            return false;
        }
        if (isProgramCode(paragraph)) {
            LOGGER.debug("?????" + paragraph);
            return false;
        }
        return true;
    }

    /**
     * Splits a paragraph into sentences at full stops and keeps only those
     * accepted by {@link #processSentence(String)}.
     *
     * @param paragraph paragraph text
     * @return accepted sentences, possibly empty
     */
    private static List<String> segSentence(String paragraph) {
        final List<String> accepted = new ArrayList<>();
        // prepareSeg swaps dots that do not end a sentence for a placeholder.
        final String prepared = prepareSeg(paragraph);
        if (StringUtils.isBlank(prepared)) {
            return accepted;
        }
        final String[] pieces = prepared.split("[.]");
        for (String piece : pieces) {
            if (StringUtils.isBlank(piece)) {
                continue;
            }
            LOGGER.debug("???" + piece);
            String sentence = processSentence(piece);
            if (sentence == null) {
                continue;
            }
            // Put the full stop back when the sentence ends with a letter.
            final char tail = sentence.charAt(sentence.length() - 1);
            if (Character.isAlphabetic(tail)) {
                sentence += ".";
            }
            // Turn the placeholder back into a dot.
            sentence = sentence.replace("??", ".");
            accepted.add(sentence);
            LOGGER.debug("??" + sentence);
            if (LOGGER.isDebugEnabled()) {
                // Sentence-length statistics are only gathered in debug mode.
                final int wordCount = sentence.split("\\s+").length;
                SENTENCE_LENGTH_INFO.computeIfAbsent(wordCount, key -> new AtomicInteger()).incrementAndGet();
            }
        }
        return accepted;
    }

public static String processSentence(String sentence){
    //
    if(StringUtils.isBlank(sentence)){
        LOGGER.debug("??" + sentence);
        return null;
    }
    sentence = sentence.trim();
    if(sentence.endsWith(",")){
        LOGGER.debug("????"+sentence);
        return null;
    }
    //??
    int i=0;
    for(char c : sentence.toCharArray()){
        if(Character.isAlphabetic(c)){
            break;
        }
        i++;
    }
    if(i>=sentence.length()){
        LOGGER.debug("???" + sentence);
        return null;
    }
    if(i>0) {
        sentence = sentence.substring(i);
    }
    if(StringUtils.isBlank(sentence)){
        LOGGER.debug("??" + sentence);
        return null;
    }
    //????
    if(!Character.isUpperCase(sentence.charAt(0))){
        LOGGER.debug("????" + sentence);
        return null;
    }
    String[] words = sentence.split("\\s+");
    if(words[0].length() == 1
            && !"A".equals(words[0])
            && !"I".equals(words[0])){
        LOGGER.debug("??????" + sentence);
        return null;
    }
    if(words[0].length() > 1 && StringUtils.isAllUpperCase(words[0])){
        LOGGER.debug("????" + sentence);
        return null;
    }
    //??
    if(words.length < SENTENCE_WORD_MIN_COUNT){
        LOGGER.debug("?" + SENTENCE_WORD_MIN_COUNT + "??" + sentence);
        return null;
    }
    //????
    if(StringUtils.isNumeric(words[words.length-1])){
        LOGGER.debug("???" + words[words.length-1] + "??" + sentence);
        return null;
    }
    //?????
    int capWordCount = 0;
    //??
    int maxWordCharCount = 0;
    for(String word : words){
        if(Character.isUpperCase(word.charAt(0))){
            capWordCount++;
        }
        if(!word.contains("http://") && word.length() > maxWordCharCount){
            maxWordCharCount = word.length();
        }
    }
    if(capWordCount > words.length*SENTENCE_CAP_WORD_MAX_RATE){
        LOGGER.debug("???" + capWordCount + "" + words.length*SENTENCE_CAP_WORD_MAX_RATE + "??" + sentence);
        return null;
    }
    if(maxWordCharCount > MAX_WORD_CHAR_COUNT){
        LOGGER.debug("??????" + maxWordCharCount + "" + MAX_WORD_CHAR_COUNT + "??" + sentence);
        return null;
    }
    //??????
    int specialWordCount = 0;
    for(String word : words){
        for(String c : punctuation){
            word = word.replace(c, "");
        }
        if(StringUtils.isNotBlank(word)
                && !StringUtils.isAlpha(word)){
            LOGGER.debug("????"+word);
            specialWordCount++;
        }
    }
    if(specialWordCount > Math.log(words.length)/2){
        LOGGER.debug(""+words.length+"????" + specialWordCount + "" + Math.log(words.length)/2 + "??" + sentence);
        return null;
    }
    //????
    int notWordCount = 0;
    Set<String> toCheck = TextAnalyzer.seg(sentence).stream().collect(Collectors.toSet());
    LOGGER.debug("???"+toCheck.size());
    for(String word : toCheck){
        if(!DICTIONARY.contains(new Word(word.toLowerCase(), ""))){
            LOGGER.debug("??"+word);
            notWordCount++;
        }
    }
    LOGGER.debug("??"+notWordCount);
    if(notWordCount > toCheck.size()*0.4){
        LOGGER.debug("????" + notWordCount + "" + toCheck.size()*0.4 + "??" + sentence);
        return null;
    }
    //[]()??
    if(sentence.contains("[")
            || sentence.contains("]")
            || sentence.contains("(")
            || sentence.contains(")")
            || sentence.contains("")
            || sentence.contains("?")
            || sentence.contains("\"")){
        char[] chars = sentence.toCharArray();
        int pre=0;
        int suf=0;
        int quotCount=0;
        for(int j=0; j<chars.length; j++){
            char c = chars[j];
            switch (c){
                case '[': LOGGER.debug("?"+c+""+j);pre++;break;
                case '(': LOGGER.debug("?"+c+""+j);pre++;break;
                case ']': LOGGER.debug("?"+c+""+j);suf++;break;
                case ')': LOGGER.debug("?"+c+""+j);suf++;break;
                case '': LOGGER.debug("?"+c+""+j);pre++;break;
                case '?': LOGGER.debug("?"+c+""+j);suf++;break;
                case '"': LOGGER.debug("?"+c+""+j);quotCount++;break;
            }
        }
        if(pre != suf){
            LOGGER.debug("[]()???"+pre+"??"+suf);
            return null;
        }
        if(quotCount%2==1){
            LOGGER.debug("[]()???"+quotCount);
            return null;
        }
    }
    return sentence;
}

    /**
     * Prepares a paragraph for splitting at full stops. The sequences ".)" and
     * "!)" become ". ". Every remaining dot that is neither followed by
     * whitespace nor the last character is then replaced by the placeholder
     * that segSentence later turns back into a dot.
     *
     * @param paragraph paragraph text
     * @return the prepared text; trimmed only when it contains a dot
     */
    private static String prepareSeg(String paragraph) {
        String text = paragraph.replace(".)", ". ");
        text = text.replace("!)", ". ");
        if (!text.contains(".")) {
            return text;
        }
        text = text.trim();
        final int length = text.length();
        final StringBuilder rebuilt = new StringBuilder();
        boolean substituted = false;
        for (int pos = 0; pos < length; pos++) {
            final char ch = text.charAt(pos);
            if (ch != '.') {
                rebuilt.append(ch);
                continue;
            }
            // A dot ends a sentence when it is the final character or is
            // followed by whitespace.
            final boolean endsSentence = pos == length - 1
                    || Character.isWhitespace(text.charAt(pos + 1));
            if (endsSentence) {
                rebuilt.append('.');
            } else {
                rebuilt.append("??");
                substituted = true;
            }
        }
        final String prepared = rebuilt.toString();
        if (substituted) {
            LOGGER.debug(".??" + prepared);
        }
        return prepared;
    }

    /**
     * Decides whether a file should be skipped.
     *
     * @param file file to check
     * @return {@code true} when the file name starts with a dot or the
     *         absolute path does not end with ".pdf" (case-sensitive)
     */
    private static boolean invalid(Path file) {
        final String baseName = file.toFile().getName();
        if (baseName.startsWith(".")) {
            return true;
        }
        final String absolutePath = file.toFile().getAbsolutePath();
        final boolean pdf = absolutePath.endsWith(".pdf");
        if (!pdf) {
            LOGGER.info("??PDF" + absolutePath);
        }
        return !pdf;
    }

    /**
     * Works out the output file for a PDF and makes it ready for writing.
     * The name is {@code src/main/resources/it} plus the PDF's absolute path
     * with its grandparent directory and every ".pdf" removed, plus ".txt".
     * Any existing file of that name is deleted and missing parent
     * directories are created.
     *
     * @param file the PDF being converted
     * @return the target file name, or {@code null} when preparation fails
     *         (the failure is logged)
     */
    private static String prepareTarget(Path file) {
        try {
            String fileName = file.toFile().getAbsolutePath();
            String baseDir = file.getParent().getParent().toFile().getAbsolutePath();
            String targetName = "src/main/resources/it"
                    + fileName.replace(baseDir, "").replace(".pdf", "")
                    + ".txt";
            Path target = Paths.get(targetName);
            // Remove the output of an earlier run.
            Files.deleteIfExists(target);
            // Make sure the directory for the output exists.
            if (Files.notExists(target.getParent())) {
                Files.createDirectories(target.getParent());
            }
            return targetName;
        } catch (Exception e) {
            // Report through the class logger instead of printStackTrace().
            LOGGER.error("Failed to prepare the target file for " + file, e);
        }
        return null;
    }

    /**
     * Clears the sentence-length histogram. The set of recorded corrupt
     * characters is left untouched.
     */
    public static void resetSentenceWordLengthInfo() {
        SENTENCE_LENGTH_INFO.clear();
    }

    /**
     * Writes, at debug level, the sentence-length histogram in ascending order
     * of word count followed by the recorded corrupt characters in ascending
     * order of code point.
     */
    public static void showSentenceWordLengthInfo() {
        LOGGER.debug("???");
        for (Integer wordCount : new TreeSet<>(SENTENCE_LENGTH_INFO.keySet())) {
            LOGGER.debug(wordCount + " -> " + SENTENCE_LENGTH_INFO.get(wordCount));
        }
        LOGGER.debug("");
        for (Character corrupt : new TreeSet<>(CORRUPT_CHAR)) {
            LOGGER.debug((int) corrupt + "=" + corrupt.toString());
        }
    }

    /**
     * Command-line entry point: resets the statistics, parses one hard-coded
     * archive and prints the statistics at debug level.
     *
     * @param args not used
     */
    public static void main(String[] args) throws Exception {
        resetSentenceWordLengthInfo();
        // Alternative 1: parse a single file.
        //String file = "/Users/apple/?/???/activemq/ActiveMQ in Action.pdf";
        //parseFile(file);
        // Alternative 2: parse one directory ...
        //String path = "/Users/apple/?/???/cassandra";
        // ... or its parent directory.
        //String path = "/Users/apple/?/???";
        // Run the directory parse.
        //parseDirectory(path);
        // NOTE(review): the original comment was garbled; it appears to say that
        // it-software-domain.zip holds 249 IT-related PDF documents - confirm.
        // Download location given by the original comment:
        // http://pan.baidu.com/s/1kT1NA3l
        parseZip(
                "/Users/apple/?/???/it-software-domain.zip");
        showSentenceWordLengthInfo();
    }
}