Example usage for org.apache.poi.xwpf.usermodel XWPFDocument getParagraphsIterator

Usage examples for org.apache.poi.xwpf.usermodel.XWPFDocument.getParagraphsIterator()

Introduction

This page collects example usages of the method org.apache.poi.xwpf.usermodel.XWPFDocument.getParagraphsIterator().

Prototype

public Iterator<XWPFParagraph> getParagraphsIterator() 

Source Link

Usage

From source file:com.maxl.java.aips2sqlite.PseudoExpertInfo.java

License:Open Source License

/**
 * Extracts all the important information from the pseudo "Fachinfo" docx file,
 * builds the HTML representation of it and hands the result to {@code addToDB()}.
 * <p>
 * Side effects: resets and refills {@code mMedi}, {@code mSectionContent},
 * {@code mSectionTitles}, {@code mBarCodes}, {@code m_list_of_packages},
 * {@code mEanCodes_str}, {@code mSectionIds_str}, {@code mSectionTitles_str} and
 * {@code m_pack_info_str}, and prints one progress line to standard output.
 *
 * @param idx running index, only used in the progress line printed to stdout
 * @param pseudo_info_file stream of the docx file to read; it is not closed here
 * @return {@code false} if the first collected section title does not match the
 *         expected first chapter for language "de" ("Zusammensetzung") or "fr"
 *         ("Composition"), or if reading the document raised an IOException;
 *         {@code true} otherwise
 */
public boolean extractInfo(int idx, FileInputStream pseudo_info_file) {
    mMedi = new MedicalInformations.MedicalInformation();

    mSectionContent = new ArrayList<String>();
    mSectionTitles = new ArrayList<String>();
    mBarCodes = new ArrayList<String>();
    m_list_of_packages = new ArrayList<String>();

    String mediTitle = "";
    String mediAuthor = "";
    String mediPseudoTag = "";
    String mediHtmlContent = "";

    StringBuilder content = new StringBuilder();

    try {
        // Read in docx file
        XWPFDocument docx = new XWPFDocument(pseudo_info_file);

        // First pass: collect the section titles (terminated by the "nil" sentinel)
        collectSectionTitles(docx);

        if (mLanguage.equals("de") && !mSectionTitles.get(0).equals("Zusammensetzung"))
            return false;
        if (mLanguage.equals("fr") && !mSectionTitles.get(0).equals("Composition"))
            return false;

        // Second pass over all paragraphs
        Iterator<XWPFParagraph> para = docx.getParagraphsIterator();

        // Init list for section content
        for (int i = 0; i < mSectionTitles.size(); ++i)
            mSectionContent.add(i, "");

        // The first paragraph is the title
        if (para.hasNext())
            mediTitle = para.next().getParagraphText();

        // The author is the paragraph right before the "Medizinprodukt" tag.
        // NOTE(review): the original comment says no pseudo FI is produced when the tag
        // is missing, but in that case this loop simply consumes all paragraphs and an
        // entry without sections is still added to the DB - confirm the intended behavior.
        String prevParaText = "";
        while (para.hasNext()) {
            String paraText = para.next().getParagraphText();
            // "\u00e9" is the e-acute that had been lost from the French literal
            if (paraText.equals("Medizinprodukt") || paraText.equals("Dispositif m\u00e9dical")) {
                mediPseudoTag = paraText;
                mediAuthor = prevParaText;
                break;
            }
            prevParaText = paraText;
        }

        // Get section titles + sections + ean codes
        boolean isSectionPackungen = false;
        int numSection = 0;
        String sectionId_str = "";
        String sectionTitle_str = "";
        mEanCodes_str = "";
        // section1 is reserved for the title of the document
        mSectionIds_str = "section1,";
        mSectionTitles_str = mediTitle + ",";
        m_pack_info_str = "";
        // An EAN code is a run of 13 digits at the very start of a paragraph
        Pattern pattern = Pattern.compile("^[0-9]{13}");

        while (para.hasNext()) {
            String paraText = para.next().getParagraphText();
            // The bounds check protects against running past the end of the title list
            // once the trailing "nil" sentinel itself has been matched.
            if (numSection < mSectionTitles.size() && paraText.equals(mSectionTitles.get(numSection))) {
                // ->> Paragraph is the next expected section title
                numSection++;
                // Section "Packungen" is special: its paragraphs are also collected separately
                isSectionPackungen = paraText.equals("Packungen") || paraText.equals("Pr\u00e9sentation");
                // Close previous div
                if (numSection > 1)
                    content.append("</div>");
                // Create html
                sectionId_str = "section" + (numSection + 1); // section1 is reserved for the MonTitle
                sectionTitle_str = mSectionTitles.get(numSection - 1);
                content.append("<div class=\"paragraph\" id=\"" + sectionId_str + "\">");
                content.append("<div class=\"absTitle\">" + sectionTitle_str + "</div>");
                // Generate section id string
                mSectionIds_str += (sectionId_str + ",");
                // Generate titles string
                mSectionTitles_str += (sectionTitle_str + ";");
            } else {
                // ->> Paragraph is section content.
                // Paragraphs located before the first section title belong to no section;
                // without this guard they would index mSectionContent at -1.
                if (numSection > 0) {
                    int current = numSection - 1;
                    mSectionContent.set(current, mSectionContent.get(current) + paraText + " ");
                }
                // Create html
                // NOTE(review): the paragraph text is inserted into the markup unescaped.
                content.append("<p class=\"spacing1\">" + paraText + "</p>");
                // Extract EAN codes; a match is always exactly 13 digits, never empty
                Matcher matcher = pattern.matcher(paraText);
                while (matcher.find()) {
                    String eanCode = matcher.group();
                    mEanCodes_str += (eanCode + ", ");
                    m_list_of_packages.add(buildPackageLine(mediTitle, eanCode));
                    // Generate bar codes
                    BarCode bc = new BarCode();
                    String barcodeImg64 = bc.encode(eanCode);
                    mBarCodes.add("<p class=\"spacing1\">" + barcodeImg64 + "</p>");
                    content.append(barcodeImg64);
                }
                // Generate section Packungen for search result
                if (isSectionPackungen)
                    m_pack_info_str += (paraText + "\n");
            }
        }

        // Remove trailing ", " from mEanCodes_str
        if (!mEanCodes_str.isEmpty())
            mEanCodes_str = mEanCodes_str.substring(0, mEanCodes_str.length() - 2);
        // Remove trailing "\n" from m_pack_info_str
        if (!m_pack_info_str.isEmpty())
            m_pack_info_str = m_pack_info_str.substring(0, m_pack_info_str.length() - 1);

        // Set title, author
        mMedi.setTitle(mediTitle);
        mMedi.setAuthHolder(mediAuthor);
        mMedi.setAtcCode("PSEUDO");
        mMedi.setSubstances(mediTitle);

        System.out.println(idx + " - " + mediTitle + ": " + mEanCodes_str);

        // Close previous div + monographie div
        content.append("</div></div>");
        String title = "<div class=\"MonTitle\" id=\"section1\">" + mediTitle + "</div>";
        String author = "<div class=\"ownerCompany\"><div style=\"text-align: right;\">" + mediAuthor
                + "</div></div>";
        // Set "Medizinprodukt" label
        String pseudo = "<p class=\"spacing1\">" + mediPseudoTag + "</p>";
        // Set medi content
        mediHtmlContent = "<html><head></head><body><div id=\"monographie\">" + title + author + pseudo
                + content.toString() + "</div></body></html>";

        // Let jsoup re-serialize the markup as clean, pretty-printed html
        Document doc = Jsoup.parse(mediHtmlContent);
        doc.outputSettings().escapeMode(EscapeMode.xhtml);
        doc.outputSettings().charset("UTF-8");
        doc.outputSettings().prettyPrint(true);
        doc.outputSettings().indentAmount(1);
        mediHtmlContent = doc.html();

        // Set html content
        mMedi.setContent(mediHtmlContent);

        // Add to DB
        addToDB();

        return true;
    } catch (IOException e) {
        e.printStackTrace();
        return false;
    }
}

/**
 * Fills {@code mSectionTitles} with the text of every paragraph that contains a bold run,
 * starting with the first chapter title ("Zusammensetzung" or "Composition"), and appends
 * the sentinel entry "nil" at the end.
 */
private void collectSectionTitles(XWPFDocument docx) {
    Iterator<XWPFParagraph> para = docx.getParagraphsIterator();
    boolean goodToGo = false;
    while (para.hasNext()) {
        XWPFParagraph paragraph = para.next();
        for (XWPFRun r : paragraph.getRuns()) {
            // A bold run identifies a section title
            if (r.isBold()) {
                String pText = paragraph.getText();
                // These are the first chapter titles (DE and FR)
                if (pText.equals("Zusammensetzung") || pText.equals("Composition"))
                    goodToGo = true;
                if (goodToGo)
                    mSectionTitles.add(pText);
                // One entry per paragraph: a title split into several bold runs must not
                // be listed more than once, otherwise the sequential matching stalls.
                break;
            }
        }
    }
    // Add "nil" at the end
    mSectionTitles.add("nil");
}

/**
 * Builds the '|'-separated package line for one EAN code, using the pricing and
 * packaging data found in {@code m_map_products} (empty fields if there is none).
 */
private String buildPackageLine(String mediTitle, String eanCode) {
    String pup = "";
    String efp = "";
    String fep = "";
    String fap = "";
    String vat = "";
    String size = "";
    String units = "";
    String swissmedic_cat = "";
    String pharma_code = "";
    int visible = 0xff;
    int has_free_samples = 0x00; // by default no free samples
    // Extract fep and fap pricing information
    // FAP = Fabrikabgabepreis = EFP?
    // FEP = Fachhandelseinkaufspreis
    // EFP = FAP < FEP < PUP
    if (m_map_products != null && m_map_products.containsKey(eanCode)) {
        Product product = m_map_products.get(eanCode);
        if (product.efp > 0.0f)
            efp = String.format("CHF %.2f", product.efp);
        if (product.pp > 0.0f)
            pup = String.format("CHF %.2f", product.pp);
        if (product.fap > 0.0f)
            fap = String.format("CHF %.2f", product.fap);
        if (product.fep > 0.0f)
            fep = String.format("CHF %.2f", product.fep);
        if (product.vat > 0.0f)
            vat = String.format("%.2f", product.vat);
        if (product.size != null && !product.size.isEmpty())
            size = product.size;
        if (product.units != null && product.units.length > 0)
            units = product.units[0];
        if (product.swissmedic_cat != null && !product.swissmedic_cat.isEmpty())
            swissmedic_cat = product.swissmedic_cat;
        if (product.pharmacode != null && !product.pharmacode.isEmpty())
            pharma_code = product.pharmacode;
        visible = product.visible;
        has_free_samples = product.free_sample;
    }
    return mediTitle.toUpperCase() + ", " + units + ", " + size + "|" + size
            + "|" + units + "|" + efp + "|" + pup + "|" + fap + "|" + fep + "|" + vat + "|"
            + swissmedic_cat + ",,|" + eanCode + "|" + pharma_code + "|" + visible + "|"
            + has_free_samples + "\n";
}

From source file:org.olat.search.service.document.file.WordOOXMLDocument.java

License:Apache License

/**
 * Appends the text of every paragraph of the document to the given buffer.
 * <p>
 * For each paragraph the following is appended, in this order: the headers (only if the
 * paragraph carries section properties), the names of the bookmarks starting in the
 * paragraph, the paragraph text as rendered by the hyperlink and comments decorators,
 * and finally the footers (again only if section properties are present). Bookmark names
 * and the paragraph text are each followed by a single space.
 *
 * @param buffy buffer receiving the extracted text
 * @param document document whose paragraphs are read
 * @throws IOException declared by the signature; not thrown directly by this code
 * @throws XmlException declared by the signature; not thrown directly by this code
 */
private void extractContent(final StringBuilder buffy, final XWPFDocument document)
        throws IOException, XmlException {
    for (final Iterator<XWPFParagraph> paragraphs = document.getParagraphsIterator(); paragraphs.hasNext();) {
        final XWPFParagraph current = paragraphs.next();

        // Section properties exist only if the paragraph has paragraph properties at all
        final CTSectPr sectionProps = current.getCTP().getPPr() == null
                ? null
                : current.getCTP().getPPr().getSectPr();
        final boolean hasSection = sectionProps != null;

        // Headers go in front of the paragraph text
        final XWPFHeaderFooterPolicy policy = hasSection
                ? new XWPFHeaderFooterPolicy(document, sectionProps)
                : null;
        if (hasSection) {
            extractHeaders(buffy, policy);
        }

        final XWPFParagraphDecorator decorated = new XWPFCommentsDecorator(
                new XWPFHyperlinkDecorator(current, null, true));

        // Bookmark names first, then the decorated paragraph text
        final CTBookmark[] starts = current.getCTP().getBookmarkStartArray();
        for (int k = 0; k < starts.length; k++) {
            buffy.append(starts[k].getName());
            buffy.append(' ');
        }
        buffy.append(decorated.getText());
        buffy.append(' ');

        // Footers go behind the paragraph text
        if (hasSection) {
            extractFooters(buffy, policy);
        }
    }
}