Example usage for org.apache.poi.xwpf.usermodel XWPFParagraph getText

Introduction

On this page you can find example usages of the getText method of org.apache.poi.xwpf.usermodel.XWPFParagraph.

Prototype

public String getText()

Source Link

Document

Return the textual content of the paragraph, including text from pictures and sdt elements in it.

Usage

From source file:org.ArticleEditor.OptionsView.MenuOptionsTopComponent.java

/**
 * Computes a character offset for the first paragraph that matches the given words.
 *
 * <p>Paragraphs are scanned in order. A paragraph matches when
 * {@code isWord(words, lowerCasedText)} is true and its text is no longer than the
 * string form of {@code words.get(0)} plus one character. The returned offset is the
 * sum of the text lengths of all preceding paragraphs, plus one per preceding
 * paragraph (presumably a line separator -- confirm against the caller), plus the
 * length of the matching paragraph itself.
 *
 * @param words         candidate words; only the first element's length is read here,
 *                      the whole collection is handed to {@code isWord}
 * @param NumParagraphs number of leading paragraphs of the document to inspect
 * @param Document      document whose paragraphs are scanned
 * @return the offset described above, or {@code -1} when the document consists of a
 *         single empty paragraph or no paragraph matches
 */
public int getPosWord(Vector words, int NumParagraphs, XWPFDocument Document) {
    // The original compared the XWPFParagraph object itself with "", which can never
    // be equal; the emptiness check has to look at the paragraph's text.
    if (NumParagraphs == 1 && Document.getParagraphArray(0).getText().isEmpty()) {
        return -1;
    }

    int offset = 0;
    for (int index = 0; index < NumParagraphs; index++) {
        String text = Document.getParagraphArray(index).getText();
        // Evaluated per paragraph (not hoisted) so an empty word list still only
        // fails when there is at least one paragraph to inspect.
        int maxLength = words.get(0).toString().length() + 1;
        if (isWord(words, text.toLowerCase()) && text.length() <= maxLength) {
            return offset + text.length();
        }
        offset += text.length() + 1;
    }
    return -1;
}

From source file:org.knime.ext.textprocessing.nodes.source.parser.word.WordDocumentParser.java

License:Open Source License

/**
 * Parses the Word document delivered by the given stream and builds a document from it.
 *
 * <p>The file kind is chosen from the suffix of {@code m_docPath}: {@code .doc} files
 * are read through HWPF, {@code .docx}/{@code .docm} files through XWPF. Paragraphs
 * that consist only of whitespace are skipped. Author and creation date are taken from
 * the document's metadata, and the title is either the file path or the first sentence
 * of the first paragraph.
 *
 * @param is stream providing the document content; it is closed before returning
 * @return the document created by the current document builder
 * @throws Exception if reading or parsing the content fails
 */
private Document parseInternal(final InputStream is) throws Exception {
    m_currentDoc = new DocumentBuilder(m_tokenizerName);
    m_currentDoc.setDocumentFile(new File(m_docPath));
    m_currentDoc.setDocumentType(m_type);
    m_currentDoc.addDocumentCategory(m_category);
    m_currentDoc.addDocumentSource(m_source);

    POIFSFileSystem fileSystem = null;
    HWPFDocument binaryDoc = null;
    XWPFDocument ooxmlDoc = null;
    WordExtractor textExtractor = null;

    try {
        if (m_docPath.endsWith(".doc")) {
            // Binary .doc: the content has to be read twice (text, then metadata),
            // so buffer the whole stream in memory first.
            final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            final byte[] chunk = new byte[1024];
            for (int read = is.read(chunk); read >= 0; read = is.read(chunk)) {
                buffer.write(chunk, 0, read);
            }
            final byte[] bytes = buffer.toByteArray();

            // First pass over the buffered bytes: paragraph text.
            final InputStream textInput = new ByteArrayInputStream(bytes);
            binaryDoc = new HWPFDocument(textInput);
            textExtractor = new WordExtractor(binaryDoc);
            for (final String rawParagraph : textExtractor.getParagraphText()) {
                final String trimmed = rawParagraph.trim();
                if (!onlyWhitepscaes(trimmed)) {
                    m_currentDoc.addParagraph(trimmed);
                }
            }

            // Second pass over the buffered bytes: summary information.
            final InputStream metaInput = new ByteArrayInputStream(bytes);
            fileSystem = new POIFSFileSystem(metaInput);
            final DirectoryEntry root = fileSystem.getRoot();
            final DocumentEntry summaryEntry =
                (DocumentEntry) root.getEntry(SummaryInformation.DEFAULT_STREAM_NAME);
            final SummaryInformation summary =
                new SummaryInformation(new PropertySet(new DocumentInputStream(summaryEntry)));

            setAuthor(summary.getAuthor());
            setPublicationDate(summary.getCreateDateTime());
        } else if (m_docPath.endsWith(".docx") || m_docPath.endsWith(".docm")) {
            // OOXML .docx/.docm: read straight from the supplied stream.
            ooxmlDoc = new XWPFDocument(is);
            for (final XWPFParagraph paragraph : ooxmlDoc.getParagraphs()) {
                final String paragraphText = paragraph.getText();
                if (!onlyWhitepscaes(paragraphText)) {
                    m_currentDoc.addParagraph(paragraphText);
                }
            }

            setAuthor(ooxmlDoc.getProperties().getCoreProperties().getCreator());
            setPublicationDate(ooxmlDoc.getProperties().getCoreProperties().getCreated());
        }

        m_currentDoc.createNewSection(SectionAnnotation.CHAPTER);

        // Determine the title: the file path, or the first sentence of the first paragraph.
        String title = null;
        if (m_filenameAsTitle) {
            title = m_docPath.trim();
        } else {
            final List<Section> sections = m_currentDoc.getSections();
            if (!sections.isEmpty()) {
                try {
                    title = sections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim();
                } catch (IndexOutOfBoundsException e) {
                    // No paragraph or sentence available in the first section.
                    LOGGER.debug("Parsed word document " + m_docPath + " is empty.");
                    title = "";
                }
            }
        }

        // Fall back to the path when the candidate title is rejected.
        if (!checkTitle(title)) {
            title = m_docPath.toString();
        }
        m_currentDoc.addTitle(title);

        return m_currentDoc.createDocument();
    } finally {
        is.close();
        if (fileSystem != null) {
            fileSystem.close();
        }
        if (binaryDoc != null) {
            binaryDoc.close();
        }
        if (ooxmlDoc != null) {
            ooxmlDoc.close();
        }
        if (textExtractor != null) {
            textExtractor.close();
        }
    }
}

From source file:org.obeonetwork.m2doc.generator.test.TableClientProcessorTest.java

License:Open Source License

/**
 * Asserts that the paragraph carries the expected text, has a spacing-after value of
 * zero and consists of exactly one run.
 *
 * @param paragraph     paragraph under test
 * @param expectedTitle text the paragraph is expected to contain
 */
protected void checkParagraph(XWPFParagraph paragraph, String expectedTitle) {
    final String actualText = paragraph.getText();
    assertEquals(expectedTitle, actualText);

    assertEquals(0, paragraph.getSpacingAfter());

    final int runCount = paragraph.getRuns().size();
    assertEquals(1, runCount);
}

From source file:org.shareok.data.documentProcessor.WordHandler.java

/**
 * Reads a .docx document from the given stream and returns the text of each paragraph.
 *
 * <p>Two defects of the previous version are fixed here: the array index was never
 * advanced, so every paragraph overwrote element 0 and the rest stayed {@code null};
 * and the supplied stream was ignored in favour of a hard-coded {@code "simple.docx"}
 * file left over from demo code.
 *
 * @param fs stream positioned at the start of a .docx document; always closed before
 *           this method returns
 * @return one entry per paragraph in document order, or {@code null} if the document
 *         could not be opened (the failure is printed to standard error, as before)
 * @throws IOException if closing {@code fs} fails
 */
private String[] readDocxFile(FileInputStream fs) throws IOException {

    String[] paragraphs = null;
    try {
        XWPFDocument document = new XWPFDocument(fs);
        List<XWPFParagraph> paragraphList = document.getParagraphs();
        paragraphs = new String[paragraphList.size()];
        int i = 0;
        for (XWPFParagraph para : paragraphList) {
            // Advance the index so each paragraph lands in its own slot.
            paragraphs[i++] = para.getText();
        }
    } catch (Exception e) {
        // Kept best-effort: callers receive null (or a partially filled array)
        // rather than an exception when parsing fails.
        e.printStackTrace();
    } finally {
        fs.close();
    }
    return paragraphs;
}

From source file:util.DocumentFunction.java

/**
 * Reads the .docx file at the given path and returns its paragraph texts joined
 * together, each followed by a newline character.
 *
 * <p>The input stream is now closed on every path; previously it was only closed when
 * no exception occurred, leaking the file handle on any read or parse failure.
 *
 * @param fileName path of the .docx file to read
 * @return the concatenated paragraph text; whatever was collected before a failure
 *         (possibly the empty string) if the file cannot be read or parsed, in which
 *         case the failure is printed to standard error
 */
public static String readDocxFile(String fileName) {
    StringBuilder text = new StringBuilder();
    File file = new File(fileName);
    // try-with-resources guarantees the stream is released even when parsing throws.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        XWPFDocument document = new XWPFDocument(fis);
        for (XWPFParagraph para : document.getParagraphs()) {
            text.append(para.getText()).append('\n');
        }
    } catch (Exception e) {
        // Best-effort read, as before: report the problem and return what we have.
        e.printStackTrace();
    }
    return text.toString();
}