de.iisys.schub.processMining.similarity.parsing.DocxParser.java Source code

Introduction

Here is the source code for de.iisys.schub.processMining.similarity.parsing.DocxParser.java
Source

package de.iisys.schub.processMining.similarity.parsing;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;

/**
 * Class to parse docx and doc files.
 * For just getting the text of a doc file, you can use the static method "parseDocSimple".
 * For parsing a docx file, you have to create an instance of this class.
 * 
 * Uses the Apache POI library for Word (https://poi.apache.org/).
 * 
 * 
 * LICENSE:
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * @author Christian Ochsenkühn
 *
 */
public class DocxParser {

    /** Style ID treated as a first-level heading (English Word templates). */
    private static final String HEADING1_EN = "heading1";
    /**
     * Style ID treated as a first-level heading (German Word templates).
     * NOTE(review): the missing leading umlaut looks intentional -- presumably the
     * style ID stored in the document omits non-ASCII characters. Verify against a
     * real German .docx before "correcting" this literal.
     */
    private static final String HEADING1_DE = "berschrift1";

    /** Paragraph style IDs that start a new chapter. */
    private final List<String> CHAPTER_STYLES;

    /** File path given to {@link #initParser(String)}; {@code null} when a stream was used. */
    private String path;
    /** The loaded document, or {@code null} if loading failed. */
    private XWPFDocument theDoc;
    /** Text produced by the last parse call; {@code null} before any parse. */
    private String fullText;
    /** Text of each chapter, filled by {@link #parseDocxAndChapters()}. */
    private List<String> chapterTexts;
    /** Headline labels, filled by {@link #parseDocxAndChapters()}. */
    private List<String> chapterHeadlines;

    /** Body elements grouped per chapter, filled by {@link #parseDocxAndChapters()}. */
    private List<List<IBodyElement>> chapters;

    /** Registers the heading style IDs that mark chapter boundaries. */
    private DocxParser() {
        CHAPTER_STYLES = new ArrayList<String>();
        CHAPTER_STYLES.add(HEADING1_DE);
        CHAPTER_STYLES.add(HEADING1_EN);
    }

    /**
     * Creates a parser and loads the .docx file at the given path.
     *
     * @param filePath
     *       path of the .docx file
     */
    public DocxParser(String filePath) {
        this();
        initParser(filePath);
    }

    /**
     * Creates a parser and loads a .docx document from the given stream.
     *
     * @param is
     *       stream providing the .docx content; it is closed by this class
     */
    public DocxParser(InputStream is) {
        this();
        initParser(is);
    }

    /**
     * Remembers the path and loads the .docx file found there.
     * If the file cannot be opened, the stack trace is printed and the
     * currently loaded document (if any) is left untouched.
     *
     * @param filePath
     *       path of the .docx file
     */
    public void initParser(String filePath) {
        this.path = filePath;
        // try-with-resources: the file handle is released even when parsing fails.
        try (FileInputStream fis = new FileInputStream(filePath)) {
            this.initParser(fis);
        } catch (IOException e) {
            // Covers FileNotFoundException as well as a failing close().
            e.printStackTrace();
        }
    }

    /**
     * Loads a .docx document from the given stream. The stream is closed afterwards.
     * If reading fails, the document is set to {@code null}.
     *
     * @param is
     *       stream providing the .docx content; may be {@code null}
     */
    public void initParser(InputStream is) {
        theDoc = readFile(is);
    }

    /**
     * Reads the stream into an {@link XWPFDocument} and closes the stream,
     * regardless of whether reading succeeded.
     *
     * @param is
     *       stream to read; may be {@code null}
     * @return
     *       the document, or {@code null} if the stream was {@code null} or
     *       could not be read (the stack trace is printed in that case)
     */
    private XWPFDocument readFile(InputStream is) {
        if (is == null) {
            return null;
        }
        try (InputStream in = is) {
            return new XWPFDocument(OPCPackage.open(in));
        } catch (InvalidFormatException | IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Parses the given .doc Word file and returns its text.
     *
     * @param filePath
     *       path of the .doc file
     * @return
     *       the extracted text, or {@code null} if the file was not found
     *       or could not be read
     */
    public static String parseDocSimple(String filePath) {
        // try-with-resources: the file handle is released even when extraction fails.
        try (FileInputStream fis = new FileInputStream(filePath)) {
            return DocxParser.parseDocSimple(fis);
        } catch (FileNotFoundException e) {
            System.out.println("DocxParser: File not found at " + filePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Parses the given .doc Word content and returns its text.
     *
     * @param is
     *       stream providing the .doc content
     * @return
     *       the extracted text, or {@code null} if an I/O error occurred
     */
    public static String parseDocSimple(InputStream is) {
        try {
            WordExtractor extr = new WordExtractor(is);
            try {
                return extr.getText();
            } finally {
                // Release the extractor even if getText() fails.
                extr.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts the text of the loaded .docx document without splitting it into chapters.
     * Use this method only when chapters are never needed; otherwise call
     * {@link #parseDocxAndChapters()} and read the result via {@link #getFullText()}.
     *
     * @return
     *       the full text as produced by the extractor. If no document is loaded,
     *       the previously stored full text is returned (possibly {@code null}).
     */
    public String parseDocxSimple() {
        if (theDoc != null) {
            XWPFWordExtractor extr = new XWPFWordExtractor(theDoc);
            try {
                this.fullText = extr.getText();
            } finally {
                // NOTE(review): closing the extractor may also release resources of
                // the underlying document -- confirm theDoc stays usable afterwards.
                try {
                    extr.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return this.fullText;
    }

    /**
     * Walks through the body of the loaded .docx document and splits it into chapters.
     * A paragraph whose style ID is one of the chapter styles starts a new chapter.
     * Afterwards the full text, the chapter texts and the chapter headlines are
     * available through the getters. Does nothing if no document is loaded.
     */
    public void parseDocxAndChapters() {
        if (theDoc == null) {
            return;
        }

        this.chapters = new ArrayList<List<IBodyElement>>();
        this.chapterTexts = new ArrayList<String>();
        this.chapterHeadlines = new ArrayList<String>();
        // Placeholder label for content that precedes the first heading.
        this.chapterHeadlines.add(0, "0:");

        StringBuilder fullBuf = new StringBuilder();
        List<IBodyElement> currentChapter = new ArrayList<IBodyElement>();
        StringBuilder chapBuf = new StringBuilder();
        int chapterCount = 1;

        for (IBodyElement el : theDoc.getBodyElements()) {
            // Elements that are neither paragraph nor table contribute no text.
            String text = "";

            if (el instanceof XWPFParagraph) {
                XWPFParagraph paragraph = (XWPFParagraph) el;
                text = paragraph.getText();

                if (CHAPTER_STYLES.contains(paragraph.getStyle())) {
                    String headline = chapterCount + ": '" + text + "'";
                    if (chapterCount == 1 && chapBuf.length() == 0) {
                        // The document starts directly with a heading.
                        // NOTE(review): this inserts in front of the "0:" placeholder
                        // instead of replacing it, so the placeholder ends up at
                        // index 1. Kept as is; confirm the intended ordering.
                        chapterHeadlines.add(0, headline);
                    } else {
                        // Close the chapter collected so far ...
                        chapters.add(currentChapter);
                        chapterTexts.add(chapBuf.toString());
                        fullBuf.append(chapBuf);

                        // ... and start collecting the next one.
                        currentChapter = new ArrayList<IBodyElement>();
                        chapBuf = new StringBuilder();
                        chapterHeadlines.add(headline);
                    }
                    chapterCount++;
                }
            } else if (el instanceof XWPFTable) {
                text = ((XWPFTable) el).getText();
            }

            // The heading paragraph itself belongs to the chapter it opens.
            chapBuf.append(text).append(' ');
            currentChapter.add(el);
        }

        // Close the last chapter. Its text must also go into the full text,
        // otherwise the full text would lack everything after the last heading.
        chapters.add(currentChapter);
        chapterTexts.add(chapBuf.toString());
        fullBuf.append(chapBuf);

        this.fullText = fullBuf.toString();
    }

    /**
     * First use 'parseDocxAndChapters' method.
     *
     * @return
     *       Returns a new list with the full text as first string
     *       and the chapters' texts afterwards. If nothing was parsed yet,
     *       the list only contains the (possibly {@code null}) full text.
     */
    public List<String> getFullTextAndChapterTexts() {
        List<String> list = new ArrayList<String>();
        list.add(fullText);
        if (this.chapterTexts != null) {
            list.addAll(this.chapterTexts);
        }
        return list;
    }

    /**
     * First use 'parseDocxAndChapters' method.
     *
     * @return
     *       the headline labels in the form {@code <number>: '<heading text>'},
     *       or {@code null} if nothing was parsed yet
     */
    public List<String> getChapterHeadlines() {
        return this.chapterHeadlines;
    }

    /**
     * First use 'parseDocxAndChapters' method.
     *
     * @return
     *       the text of each chapter, or {@code null} if nothing was parsed yet
     */
    public List<String> getChapterTexts() {
        return this.chapterTexts;
    }

    /**
     * First use 'parseDocxAndChapters' or 'parseDocxSimple' method.
     *
     * @return
     *       Returns the full text (incl. tables) as string,
     *       or {@code null} if nothing was parsed yet.
     */
    public String getFullText() {
        return this.fullText;
    }

    /**
     * Returns the paragraphs of the loaded document.
     *
     * @return
     *       the document's paragraphs, or an empty list if no document is loaded
     */
    public List<XWPFParagraph> getParagraphs() {
        if (theDoc == null) {
            return new ArrayList<XWPFParagraph>();
        }
        return theDoc.getParagraphs();
    }

    /**
     * Returns a title for the loaded document.
     *
     * @return
     *       the title from the document's core properties if present; otherwise
     *       the file path the parser was initialised with; otherwise an empty string
     */
    public String getTitle() {
        String title = null;
        if (theDoc != null) {
            title = theDoc.getProperties().getCoreProperties().getTitle();
        }

        if (title != null) {
            return title;
        }
        if (this.path != null) {
            return this.path;
        }
        return "";
    }
}