org.wandora.utils.MSOfficeBox.java Source code

Introduction

Here is the source code for org.wandora.utils.MSOfficeBox.java
Source

/*
 * WANDORA
 * Knowledge Extraction, Management, and Publishing Application
 * http://wandora.org
 * 
 * Copyright (C) 2004-2016 Wandora Team
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *
 * 
 *
 * MSOfficeBox.java
 *
 * Created on 13 June 2006, 14:20
 *
 */

package org.wandora.utils;

import java.io.*;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import org.apache.poi.*;
import org.apache.poi.extractor.*;
import org.apache.poi.hslf.extractor.*;
import org.apache.poi.hwpf.extractor.*;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

/**
 * Static helpers for extracting plain text from Microsoft Office documents
 * (Word, PowerPoint, and any other format POI's {@code ExtractorFactory}
 * recognises). Based on the Apache POI framework.
 *
 * <p>The {@code InputStream}, {@code File} and {@code URL} entry points are
 * best-effort: when extraction fails the exception is printed with
 * {@code printStackTrace()} and {@code null} is returned instead of being
 * propagated. This class does not itself close streams handed in by the
 * caller, but it always releases the streams and file handles it opens.
 *
 * @author akivela
 */

public class MSOfficeBox {

    /**
     * Creates a new instance of MSOfficeBox. Every method of the class is
     * static, so an instance carries no state; the constructor is kept public
     * for backward compatibility.
     */
    public MSOfficeBox() {
    }

    // ----------------------------------------------------------- WORD TEXT ---

    /**
     * Extracts the text of a binary Word document using the paragraph based
     * extraction of {@link #getWordTextOld(HWPFDocument)}.
     *
     * @param is stream positioned at the start of the Word document
     * @return the document text, or {@code null} if the document could not be
     *         read (the exception is printed to standard error)
     */
    public static String getWordTextOld(InputStream is) {
        try {
            return getWordTextOld(new HWPFDocument(is));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Get the text from the word file, as an array with one String
     * per paragraph. A paragraph that ends with a bare {@code \r} gets a
     * {@code \n} appended.
     *
     * <p>If walking the paragraphs fails for any reason, the result is a
     * one-element array holding the output of
     * {@link #getWordTextFromPieces(HWPFDocument)}.
     *
     * @param doc the parsed Word document
     * @return the paragraph texts, never {@code null}
     */
    public static String[] getWordParagraphText(HWPFDocument doc) {
        String[] ret;

        // Extract using the model code
        try {
            Range r = doc.getRange();

            ret = new String[r.numParagraphs()];
            for (int i = 0; i < ret.length; i++) {
                Paragraph p = r.getParagraph(i);
                String paragraphText = p.text();

                // Fix the line ending
                ret[i] = paragraphText.endsWith("\r") ? paragraphText + "\n" : paragraphText;
            }
        } catch (Exception e) {
            // Something's up with turning the text pieces into paragraphs
            // Fall back to ripping out the text pieces
            ret = new String[] { getWordTextFromPieces(doc) };
        }

        return ret;
    }

    /**
     * Grab the text out of the text pieces. Might also include various
     * bits of crud, but will work in cases where the text piece -> paragraph
     * mapping is broken. Fast too.
     *
     * <p>Each piece is decoded as UTF-16LE when it reports itself as Unicode
     * and as Cp1252 otherwise. Every bare {@code \r} in the concatenated text
     * is then turned into {@code \r\n}, which matches the line endings
     * produced by {@link #getWordParagraphText(HWPFDocument)}.
     *
     * @param doc the parsed Word document
     * @return the concatenated text of all text pieces
     * @throws InternalError if the JVM does not support the required encoding
     */
    public static String getWordTextFromPieces(HWPFDocument doc) {
        StringBuilder textBuf = new StringBuilder();

        // Iterator<?> plus a cast compiles whether or not the POI version in
        // use declares the text piece list with a generic element type.
        Iterator<?> textPieces = doc.getTextTable().getTextPieces().iterator();
        while (textPieces.hasNext()) {
            TextPiece piece = (TextPiece) textPieces.next();

            String encoding = piece.isUnicode() ? "UTF-16LE" : "Cp1252";
            try {
                textBuf.append(new String(piece.getRawBytes(), encoding));
            } catch (UnsupportedEncodingException e) {
                InternalError error = new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
                // Keep the original failure attached for diagnosis.
                error.initCause(e);
                throw error;
            }
        }

        return normalizeLineEndings(textBuf.toString());
    }

    /**
     * Converts every carriage return that is not already followed by a line
     * feed into a CR LF pair; existing CR LF pairs are left untouched.
     */
    private static String normalizeLineEndings(String text) {
        // The negative lookahead also matches a CR at the very end of the text.
        return text.replaceAll("\r(?!\n)", "\r\n");
    }

    /**
     * Grab the text, based on the paragraphs. Shouldn't include any crud,
     * but slightly slower than {@link #getWordTextFromPieces(HWPFDocument)}.
     *
     * @param doc the parsed Word document
     * @return all paragraph texts concatenated in document order
     */
    public static String getWordTextOld(HWPFDocument doc) {
        StringBuilder ret = new StringBuilder();
        for (String paragraph : getWordParagraphText(doc)) {
            ret.append(paragraph);
        }
        return ret.toString();
    }

    /**
     * Extracts the text of a binary Word document with POI's
     * {@code WordExtractor}.
     *
     * @param is stream positioned at the start of the Word document
     * @return the document text, or {@code null} if extraction failed (the
     *         exception is printed to standard error)
     */
    public static String getWordText(InputStream is) {
        try {
            WordExtractor extractor = new WordExtractor(is);
            return extractor.getText();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    // ---------------------------------------------------------- PowerPoint ---

    /**
     * Extracts the text of a binary PowerPoint document with POI's
     * {@code PowerPointExtractor}, passing {@code true} for both flags of
     * {@code getText(boolean, boolean)}.
     *
     * @param is stream positioned at the start of the PowerPoint document
     * @return the extracted text, or {@code null} if extraction failed (the
     *         exception is printed to standard error)
     */
    public static String getPowerPointText(InputStream is) {
        try {
            PowerPointExtractor extractor = new PowerPointExtractor(is);
            return extractor.getText(true, true);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    // ----------------------------------------------------------------- Any ---

    /**
     * Opens the URL and extracts the text of the document found there via
     * {@link #getText(InputStream)}. The stream opened here is always closed.
     *
     * @param url location of the document
     * @return the extracted text, or {@code null} if the URL could not be
     *         opened or extraction failed (the exception is printed to
     *         standard error)
     */
    public static String getText(URL url) {
        InputStream is = null;
        try {
            is = url.openStream();
            return getText(is);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(is);
        }
        return null;
    }

    /**
     * Extracts the text of a {@code .docx} file with POI's
     * {@code XWPFWordExtractor}. The file stream opened here is always closed.
     *
     * @param file the {@code .docx} file to read
     * @return the extracted text, or {@code null} if the file could not be
     *         read (the exception is printed to standard error)
     */
    public static String getDocxText(File file) {
        InputStream in = null;
        try {
            in = new FileInputStream(file);
            XWPFDocument docx = new XWPFDocument(in);
            XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
            return extractor.getText();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return null;
    }

    /**
     * Extracts the text of any document type that POI's
     * {@code ExtractorFactory} can identify from the stream contents.
     *
     * @param is stream positioned at the start of the document
     * @return the extracted text, or {@code null} if extraction failed (the
     *         exception is printed to standard error)
     */
    public static String getText(InputStream is) {
        try {
            POITextExtractor extractor = ExtractorFactory.createExtractor(is);
            return extractor.getText();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Extracts the text of any document type that POI's
     * {@code ExtractorFactory} can identify from the file contents. The
     * extractor is closed afterwards so that the file it opened is released.
     *
     * @param f the document file
     * @return the extracted text, or {@code null} if extraction failed (the
     *         exception is printed to standard error)
     */
    public static String getText(File f) {
        POITextExtractor extractor = null;
        try {
            extractor = ExtractorFactory.createExtractor(f);
            return extractor.getText();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // NOTE(review): relies on POITextExtractor implementing
            // java.io.Closeable - confirm against the bundled POI version.
            closeQuietly(extractor);
        }
        return null;
    }

    /**
     * Closes the resource if it is non-null. A failure to close is ignored on
     * purpose: by the time this runs the text has either been extracted or the
     * real error has already been reported, and a close failure must not
     * replace either outcome.
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best-effort cleanup; see method comment.
        }
    }
}