de.powerstaff.business.service.impl.reader.msword.DOCXWordDocumentReader.java Source code

Java tutorial

Introduction

Here is the source code for de.powerstaff.business.service.impl.reader.msword.DOCXWordDocumentReader.java

Source

/**
 * Mogwai PowerStaff. Copyright (C) 2002 The Mogwai Project.
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
package de.powerstaff.business.service.impl.reader.msword;

import de.powerstaff.business.service.impl.reader.AbstractDocumentReader;
import de.powerstaff.business.service.impl.reader.ReadResult;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.File;
import java.io.FileInputStream;

/**
 * DocumentReader for microsoft word documents.
 *
 * @author sertic
 */
public class DOCXWordDocumentReader extends AbstractDocumentReader {

    public ReadResult getContent(File inputFile) throws Exception {

        XWPFDocument theDocument = new XWPFDocument(new FileInputStream(inputFile));
        XWPFWordExtractor theExtractor = new XWPFWordExtractor(theDocument);
        String theText = theExtractor.getText();
        theText = theText.replace('|', ' ');
        return new ReadResult(toFlatString(theText));
    }
}