de.catma.document.source.contenthandler.DOCXContentHandler.java Source code

Java tutorial

Introduction

Here is the source code for de.catma.document.source.contenthandler.DOCXContentHandler.java

Source

/*   
 *   CATMA Computer Aided Text Markup and Analysis
 *   
 *   Copyright (C) 2009-2013  University Of Hamburg
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.catma.document.source.contenthandler;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import de.catma.document.source.FileOSType;

/**
 * A content handler for MS Word Docx based {@link de.catma.document.source.SourceDocument}s.
 * 
 * @author marco.petris@web.de
 *
 */
public class DOCXContentHandler extends AbstractSourceContentHandler {

    /* (non-Javadoc)
     * @see de.catma.document.source.contenthandler.SourceContentHandler#load(java.io.InputStream)
     */
    @Override
    public void load(InputStream is) throws IOException {
        XWPFDocument doc = new XWPFDocument(is);
        XWPFWordExtractor wordExtractor = new XWPFWordExtractor(doc);
        String buf = wordExtractor.getText();

        //it's still microsoft after all
        if (FileOSType.getFileOSType(buf).equals(FileOSType.UNIX)) {
            buf = FileOSType.convertUnixToDos(buf);
        }

        setContent(buf);
    }

    /* (non-Javadoc)
     * @see de.catma.document.source.contenthandler.SourceContentHandler#load()
     */
    @Override
    public void load() throws IOException {
        BufferedInputStream bis = null;
        try {

            bis = new BufferedInputStream(getSourceDocumentInfo().getTechInfoSet().getURI().toURL().openStream());

            load(bis);
        } finally {
            if (bis != null) {
                bis.close();
            }
        }
    }

}