Example usage for org.apache.poi.xwpf.usermodel XWPFParagraph getIRuns

List of usage examples for org.apache.poi.xwpf.usermodel XWPFParagraph getIRuns

Introduction

In this page you can find the example usage for org.apache.poi.xwpf.usermodel XWPFParagraph getIRuns.

Prototype

public List<IRunElement> getIRuns() 

Source Link

Document

Return literal runs and sdt/content control objects.

Usage

From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator.java

License:Apache License

private void extractParagraph(XWPFParagraph paragraph, XHTMLContentHandler xhtml)
        throws SAXException, XmlException, IOException {
    // If this paragraph is actually a whole new section, then
    // it could have its own headers and footers
    // Check and handle if so
    XWPFHeaderFooterPolicy headerFooterPolicy = null;
    if (paragraph.getCTP().getPPr() != null) {
        CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
        if (ctSectPr != null) {
            headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
            extractHeaders(xhtml, headerFooterPolicy);
        }/*from   w  w  w  .ja va  2s  .  c  o m*/
    }

    // Is this a paragraph, or a heading?
    String tag = "p";
    String styleClass = null;
    if (paragraph.getStyleID() != null) {
        XWPFStyle style = styles.getStyle(paragraph.getStyleID());

        if (style != null && style.getName() != null) {
            TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(style.getName(),
                    paragraph.getPartType() == BodyType.TABLECELL);
            tag = tas.getTag();
            styleClass = tas.getStyleClass();
        }
    }

    if (styleClass == null) {
        xhtml.startElement(tag);
    } else {
        xhtml.startElement(tag, "class", styleClass);
    }

    // Output placeholder for any embedded docs:

    // TODO: replace w/ XPath/XQuery:
    for (XWPFRun run : paragraph.getRuns()) {
        XmlCursor c = run.getCTR().newCursor();
        c.selectPath("./*");
        while (c.toNextSelection()) {
            XmlObject o = c.getObject();
            if (o instanceof CTObject) {
                XmlCursor c2 = o.newCursor();
                c2.selectPath("./*");
                while (c2.toNextSelection()) {
                    XmlObject o2 = c2.getObject();

                    XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
                    if (embedAtt != null && embedAtt.getDomNode().getNodeValue().equals("Embed")) {
                        // Type is "Embed"
                        XmlObject relIDAtt = o2.selectAttribute(new QName(
                                "http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
                        if (relIDAtt != null) {
                            String relID = relIDAtt.getDomNode().getNodeValue();
                            AttributesImpl attributes = new AttributesImpl();
                            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                            attributes.addAttribute("", "id", "id", "CDATA", relID);
                            xhtml.startElement("div", attributes);
                            xhtml.endElement("div");
                        }
                    }
                }
                c2.dispose();
            }
        }

        c.dispose();
    }

    // Attach bookmarks for the paragraph
    // (In future, we might put them in the right place, for now
    // we just put them in the correct paragraph)
    for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartList()) {
        xhtml.startElement("a", "name", bookmark.getName());
        xhtml.endElement("a");
    }

    TmpFormatting fmtg = new TmpFormatting(false, false);

    // Do the iruns
    for (IRunElement run : paragraph.getIRuns()) {
        if (run instanceof XWPFSDT) {
            fmtg = closeStyleTags(xhtml, fmtg);
            processSDTRun((XWPFSDT) run, xhtml);
            // for now, we're ignoring formatting in sdt
            // if you hit an sdt reset to false
            fmtg.setBold(false);
            fmtg.setItalic(false);
        } else {
            fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg);
        }
    }
    closeStyleTags(xhtml, fmtg);

    // Now do any comments for the paragraph
    XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
    String commentText = comments.getCommentText();
    if (commentText != null && commentText.length() > 0) {
        xhtml.characters(commentText);
    }

    String footnameText = paragraph.getFootnoteText();
    if (footnameText != null && footnameText.length() > 0) {
        xhtml.characters(footnameText + "\n");
    }

    // Also extract any paragraphs embedded in text boxes:
    for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath(
            "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
        extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()),
                xhtml);
    }

    // Finish this paragraph
    xhtml.endElement(tag);

    if (headerFooterPolicy != null) {
        extractFooters(xhtml, headerFooterPolicy);
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator.java

License:Apache License

private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager, XHTMLContentHandler xhtml)
        throws SAXException, XmlException, IOException {
    // If this paragraph is actually a whole new section, then
    //  it could have its own headers and footers
    // Check and handle if so
    XWPFHeaderFooterPolicy headerFooterPolicy = null;
    if (paragraph.getCTP().getPPr() != null) {
        CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
        if (ctSectPr != null) {
            headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
            extractHeaders(xhtml, headerFooterPolicy, listManager);
        }/*from w  w w  . j a v  a 2 s . c  o  m*/
    }

    // Is this a paragraph, or a heading?
    String tag = "p";
    String styleClass = null;
    if (paragraph.getStyleID() != null) {
        XWPFStyle style = styles.getStyle(paragraph.getStyleID());

        if (style != null && style.getName() != null) {
            TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(style.getName(),
                    paragraph.getPartType() == BodyType.TABLECELL);
            tag = tas.getTag();
            styleClass = tas.getStyleClass();
        }
    }

    if (styleClass == null) {
        xhtml.startElement(tag);
    } else {
        xhtml.startElement(tag, "class", styleClass);
    }

    writeParagraphNumber(paragraph, listManager, xhtml);
    // Output placeholder for any embedded docs:

    // TODO: replace w/ XPath/XQuery:
    for (XWPFRun run : paragraph.getRuns()) {
        XmlCursor c = run.getCTR().newCursor();
        c.selectPath("./*");
        while (c.toNextSelection()) {
            XmlObject o = c.getObject();
            if (o instanceof CTObject) {
                XmlCursor c2 = o.newCursor();
                c2.selectPath("./*");
                while (c2.toNextSelection()) {
                    XmlObject o2 = c2.getObject();

                    XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
                    if (embedAtt != null && embedAtt.getDomNode().getNodeValue().equals("Embed")) {
                        // Type is "Embed"
                        XmlObject relIDAtt = o2.selectAttribute(new QName(
                                "http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
                        if (relIDAtt != null) {
                            String relID = relIDAtt.getDomNode().getNodeValue();
                            AttributesImpl attributes = new AttributesImpl();
                            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                            attributes.addAttribute("", "id", "id", "CDATA", relID);
                            xhtml.startElement("div", attributes);
                            xhtml.endElement("div");
                        }
                    }
                }
                c2.dispose();
            }
        }

        c.dispose();
    }

    // Attach bookmarks for the paragraph
    // (In future, we might put them in the right place, for now
    //  we just put them in the correct paragraph)
    for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) {
        CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
        xhtml.startElement("a", "name", bookmark.getName());
        xhtml.endElement("a");
    }

    TmpFormatting fmtg = new TmpFormatting(false, false);

    // Do the iruns
    for (IRunElement run : paragraph.getIRuns()) {
        if (run instanceof XWPFSDT) {
            fmtg = closeStyleTags(xhtml, fmtg);
            processSDTRun((XWPFSDT) run, xhtml);
            //for now, we're ignoring formatting in sdt
            //if you hit an sdt reset to false
            fmtg.setBold(false);
            fmtg.setItalic(false);
        } else {
            fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg);
        }
    }
    closeStyleTags(xhtml, fmtg);

    // Now do any comments for the paragraph
    XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
    String commentText = comments.getCommentText();
    if (commentText != null && commentText.length() > 0) {
        xhtml.characters(commentText);
    }

    String footnameText = paragraph.getFootnoteText();
    if (footnameText != null && footnameText.length() > 0) {
        xhtml.characters(footnameText + "\n");
    }

    // Also extract any paragraphs embedded in text boxes:
    for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath(
            "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
        extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()),
                listManager, xhtml);
    }

    // Finish this paragraph
    xhtml.endElement(tag);

    if (headerFooterPolicy != null) {
        extractFooters(xhtml, headerFooterPolicy, listManager);
    }
}