org.olat.search.service.document.file.WordOOXMLDocument.java Source code

Java tutorial

Introduction

Here is the source code for org.olat.search.service.document.file.WordOOXMLDocument.java

Source

/**
 * OLAT - Online Learning and Training<br>
 * http://www.olat.org
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License"); <br>
 * you may not use this file except in compliance with the License.<br>
 * You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing,<br>
 * software distributed under the License is distributed on an "AS IS" BASIS, <br>
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
 * See the License for the specific language governing permissions and <br>
 * limitations under the License.
 * <p>
 * Copyright (c) frentix GmbH<br>
 * http://www.frentix.com<br>
 * <p>
 */

package org.olat.search.service.document.file;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.document.Document;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.xmlbeans.XmlException;
import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;
import org.olat.core.util.vfs.VFSLeaf;
import org.olat.search.service.SearchResourceContext;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;

/**
 * Description:<br>
 * Search document for Word XML files (.docx). The text is read with Apache POI:
 * headers, paragraph text (including bookmark names) and footers are concatenated
 * into one string, each piece followed by a single space.
 * <P>
 * Initial Date: 14 dec. 2009 <br>
 * 
 * @author srosse, stephane.rosse@frentix.com
 */
public class WordOOXMLDocument extends FileDocument {
    private static final OLog log = Tracing.createLoggerFor(WordOOXMLDocument.class);

    /** File type set on every document built by {@link #createDocument}. */
    public static final String FILE_TYPE = "type.file.word";

    public WordOOXMLDocument() {
        super();
    }

    /**
     * Builds the Lucene document for a Word XML file.
     * <p>
     * A new {@code WordOOXMLDocument} is initialised with the given context and leaf,
     * tagged with {@link #FILE_TYPE} and the {@code b_filetype_doc} CSS icon, and then
     * converted with {@code getLuceneDocument()}.
     *
     * @param leafResourceContext search context handed to {@code init}
     * @param leaf the .docx file handed to {@code init}
     * @return the Lucene document produced by {@code getLuceneDocument()}
     * @throws IOException declared for {@code init}
     * @throws DocumentException declared for {@code init}
     * @throws DocumentAccessException declared for {@code init}
     */
    public static Document createDocument(final SearchResourceContext leafResourceContext, final VFSLeaf leaf)
            throws IOException, DocumentException, DocumentAccessException {
        final WordOOXMLDocument wordDocument = new WordOOXMLDocument();
        wordDocument.init(leafResourceContext, leaf);
        wordDocument.setFileType(FILE_TYPE);
        wordDocument.setCssIcon("b_filetype_doc");
        if (log.isDebug()) {
            log.debug(wordDocument.toString());
        }
        return wordDocument.getLuceneDocument();
    }

    /**
     * Extracts the indexable text of a .docx file.
     * <p>
     * The leaf's stream is handed to POI's {@code ExtractorFactory}. When the resulting
     * document is an {@code XWPFDocument}, the text is collected in this order:
     * document-level headers, paragraphs, document-level footers. For any other POI XML
     * document an empty string is returned.
     *
     * @param leaf the file to read
     * @return the extracted text; empty when the file is not a word-processing document
     * @throws IOException if closing the input stream fails
     * @throws DocumentException if opening or parsing the file fails for any reason; the
     *             message is never {@code null} and the original exception is attached as
     *             cause when possible
     */
    @Override
    protected String readContent(final VFSLeaf leaf) throws IOException, DocumentException {
        BufferedInputStream bis = null;
        final StringBuilder buffy = new StringBuilder();
        try {
            bis = new BufferedInputStream(leaf.getInputStream());
            // A non-OOXML file makes this cast fail; the ClassCastException is turned into a
            // DocumentException by the catch block below.
            // NOTE(review): the extractor (and the package behind it) is never released
            // explicitly; confirm whether the POI version in use offers a close() for it.
            final POIXMLTextExtractor extractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(bis);
            final POIXMLDocument document = extractor.getDocument();

            if (document instanceof XWPFDocument) {
                final XWPFDocument xDocument = (XWPFDocument) document;
                // may be null; extractHeaders/extractFooters tolerate that
                final XWPFHeaderFooterPolicy hfPolicy = xDocument.getHeaderFooterPolicy();
                extractHeaders(buffy, hfPolicy);
                extractContent(buffy, xDocument);
                extractFooters(buffy, hfPolicy);
            }

            return buffy.toString();
        } catch (final Exception e) {
            throw asDocumentException(e);
        } finally {
            if (bis != null) {
                bis.close();
            }
        }
    }

    /**
     * Wraps an arbitrary failure into a {@code DocumentException} without losing information.
     *
     * @param e the failure raised while reading the document
     * @return a DocumentException carrying a non-null message and, when possible, {@code e} as cause
     */
    private static DocumentException asDocumentException(final Exception e) {
        // Some exceptions (NullPointerException, ClassCastException on some JVMs, ...) have no
        // message; fall back to toString() so the failure stays diagnosable.
        final String message = e.getMessage() != null ? e.getMessage() : e.toString();
        final DocumentException wrapped = new DocumentException(message);
        try {
            wrapped.initCause(e);
        } catch (final IllegalStateException ignored) {
            // The cause was already fixed by the DocumentException constructor; the message
            // alone has to do.
        }
        return wrapped;
    }

    /**
     * Appends the text of every paragraph of the document body.
     * <p>
     * For a paragraph that carries section properties ({@code sectPr}), the headers of the
     * header/footer policy built from those properties are appended before the paragraph
     * text and its footers after it. For every paragraph the names of its bookmark starts
     * are appended, followed by the paragraph text as rendered by the hyperlink and
     * comments decorators.
     *
     * @param buffy receives the extracted text
     * @param document the document whose paragraphs are read
     * @throws IOException declared for the {@code XWPFHeaderFooterPolicy} constructor
     * @throws XmlException declared for the {@code XWPFHeaderFooterPolicy} constructor
     */
    private void extractContent(final StringBuilder buffy, final XWPFDocument document)
            throws IOException, XmlException {
        // first all paragraphs
        final Iterator<XWPFParagraph> i = document.getParagraphsIterator();
        while (i.hasNext()) {
            final XWPFParagraph paragraph = i.next();
            CTSectPr ctSectPr = null;
            if (paragraph.getCTP().getPPr() != null) {
                ctSectPr = paragraph.getCTP().getPPr().getSectPr();
            }

            XWPFHeaderFooterPolicy headerFooterPolicy = null;
            if (ctSectPr != null) {
                headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
                extractHeaders(buffy, headerFooterPolicy);
            }

            // NOTE(review): the 'true' flag presumably asks the hyperlink decorator to emit the
            // link targets as well - confirm against the POI API documentation.
            final XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
                    new XWPFHyperlinkDecorator(paragraph, null, true));

            final CTBookmark[] bookmarks = paragraph.getCTP().getBookmarkStartArray();
            for (final CTBookmark bookmark : bookmarks) {
                final String bookmarkName = bookmark.getName();
                // StringBuilder.append(null) would put the literal word "null" into the index
                if (bookmarkName != null) {
                    buffy.append(bookmarkName).append(' ');
                }
            }

            buffy.append(decorator.getText()).append(' ');

            if (ctSectPr != null) {
                extractFooters(buffy, headerFooterPolicy);
            }
        }
    }

    /**
     * Appends the text of the first-page, even-page and default footers, in that order,
     * skipping those the policy does not define.
     *
     * @param buffy receives the footer text
     * @param hfPolicy the header/footer policy to read; {@code null} means there is nothing
     *            to extract
     */
    private void extractFooters(final StringBuilder buffy, final XWPFHeaderFooterPolicy hfPolicy) {
        if (hfPolicy == null) {
            return;
        }
        if (hfPolicy.getFirstPageFooter() != null) {
            buffy.append(hfPolicy.getFirstPageFooter().getText()).append(' ');
        }
        if (hfPolicy.getEvenPageFooter() != null) {
            buffy.append(hfPolicy.getEvenPageFooter().getText()).append(' ');
        }
        if (hfPolicy.getDefaultFooter() != null) {
            buffy.append(hfPolicy.getDefaultFooter().getText()).append(' ');
        }
    }

    /**
     * Appends the text of the first-page, even-page and default headers, in that order,
     * skipping those the policy does not define.
     *
     * @param buffy receives the header text
     * @param hfPolicy the header/footer policy to read; {@code null} means there is nothing
     *            to extract
     */
    private void extractHeaders(final StringBuilder buffy, final XWPFHeaderFooterPolicy hfPolicy) {
        if (hfPolicy == null) {
            return;
        }
        if (hfPolicy.getFirstPageHeader() != null) {
            buffy.append(hfPolicy.getFirstPageHeader().getText()).append(' ');
        }
        if (hfPolicy.getEvenPageHeader() != null) {
            buffy.append(hfPolicy.getEvenPageHeader().getText()).append(' ');
        }
        if (hfPolicy.getDefaultHeader() != null) {
            buffy.append(hfPolicy.getDefaultHeader().getText()).append(' ');
        }
    }
}