List of usage examples for org.apache.poi.xwpf.usermodel XWPFDocument getParagraphsIterator
public Iterator<XWPFParagraph> getParagraphsIterator()
From source file:com.maxl.java.aips2sqlite.PseudoExpertInfo.java
License:Open Source License
/** * Extracts all the important information from the pseudo "Fachinfo" file * @param pseudo_info_file//from www . ja v a 2 s . co m */ public boolean extractInfo(int idx, FileInputStream pseudo_info_file) { mMedi = new MedicalInformations.MedicalInformation(); mSectionContent = new ArrayList<String>(); mSectionTitles = new ArrayList<String>(); mBarCodes = new ArrayList<String>(); m_list_of_packages = new ArrayList<String>(); String mediTitle = ""; String mediAuthor = ""; String mediPseudoTag = ""; String mediHtmlContent = ""; StringBuilder content = new StringBuilder(); try { // Read in docx file XWPFDocument docx = new XWPFDocument(pseudo_info_file); // Get iterator through all paragraphs Iterator<XWPFParagraph> para = docx.getParagraphsIterator(); // Pre-process input stream to extract paragraph titles boolean goodToGo = false; while (para.hasNext()) { List<XWPFRun> runs = para.next().getRuns(); if (!runs.isEmpty()) { for (XWPFRun r : runs) { // bold and italics identifies section title! if (r.isBold()) { // && r.isItalic()) { String pText = r.getParagraph().getText(); // These are the first chapter titles (DE and FR) if (pText.equals("Zusammensetzung") || pText.equals("Composition")) goodToGo = true; if (goodToGo == true) mSectionTitles.add(pText); } } } } // Add "nil" at the end mSectionTitles.add("nil"); if (mLanguage.equals("de") && !mSectionTitles.get(0).equals("Zusammensetzung")) return false; if (mLanguage.equals("fr") && !mSectionTitles.get(0).equals("Composition")) return false; // Reset iterator para = docx.getParagraphsIterator(); // Init list for section content for (int i = 0; i < mSectionTitles.size(); ++i) mSectionContent.add(i, ""); // Get title if (para.hasNext()) mediTitle = para.next().getParagraphText(); // Get author while using "Medizinprodukt" as tag String prevParaText = ""; while (para.hasNext()) { String paraText = para.next().getParagraphText(); // If this word is not found, then no pseudo FI will be produced if (paraText.equals("Medizinprodukt") || paraText.equals("Dispositif mdical")) { mediPseudoTag = paraText; mediAuthor = prevParaText; break; } prevParaText = paraText; } // Get section titles + sections + ean codes boolean isSectionPackungen = false; int numSection = 0; // Init with section1 and title String sectionId_str = ""; String sectionTitle_str = ""; mEanCodes_str = ""; mSectionIds_str = "section1,"; mSectionTitles_str = mediTitle + ","; m_pack_info_str = ""; // This is the EAN code pattern Pattern pattern = Pattern.compile("^[0-9]{13}"); // Loop through it, identifying medication title, author, section titles and corresponding titles while (para.hasNext()) { String paraText = para.next().getParagraphText(); if (paraText.equals(mSectionTitles.get(numSection))) { // ->> Get section title isSectionPackungen = false; // Get section title if (numSection < mSectionTitles.size()) numSection++; // Section "Packungen" is special if (paraText.equals("Packungen") || paraText.equals("Prsentation")) { isSectionPackungen = true; } // Close previous div if (numSection > 1) content.append("</div>"); // Create html sectionId_str = "section" + (numSection + 1); // section1 is reserved for the MonTitle sectionTitle_str = mSectionTitles.get(numSection - 1); content.append("<div class=\"paragraph\" id=\"" + sectionId_str + "\">"); content.append("<div class=\"absTitle\">" + sectionTitle_str + "</div>"); // Generate section id string mSectionIds_str += (sectionId_str + ","); // Generate titles string mSectionTitles_str += (sectionTitle_str + ";"); } else { // ->> Get section content String s = mSectionContent.get(numSection - 1); mSectionContent.set(numSection - 1, s + paraText + " "); // Create html content.append("<p class=\"spacing1\">" + paraText + "</p>"); // Extract EAN codes and start positions Matcher matcher = pattern.matcher(paraText); while (matcher.find()) { String eanCode = matcher.group(); mEanCodes_str += (eanCode + ", "); if (!eanCode.isEmpty()) { String pup = ""; String efp = ""; String fep = ""; String fap = ""; String vat = ""; String size = ""; String units = ""; String swissmedic_cat = ""; String pharma_code = ""; int visible = 0xff; int has_free_samples = 0x00; // by default no free samples // Exctract fep and fap pricing information // FAP = Fabrikabgabepreis = EFP? // FEP = Fachhandelseinkaufspreis // EFP = FAP < FEP < PUP if (m_map_products != null && eanCode != null && m_map_products.containsKey(eanCode)) { Product product = m_map_products.get(eanCode); if (product.efp > 0.0f) efp = String.format("CHF %.2f", product.efp); if (product.pp > 0.0f) pup = String.format("CHF %.2f", product.pp); if (product.fap > 0.0f) fap = String.format("CHF %.2f", product.fap); if (product.fep > 0.0f) fep = String.format("CHF %.2f", product.fep); if (product.vat > 0.0f) vat = String.format("%.2f", product.vat); if (product.size != null && !product.size.isEmpty()) size = product.size; if (product.units != null && product.units.length > 0) units = product.units[0]; if (product.swissmedic_cat != null && !product.swissmedic_cat.isEmpty()) swissmedic_cat = product.swissmedic_cat; if (product.pharmacode != null && !product.pharmacode.isEmpty()) pharma_code = product.pharmacode; visible = product.visible; has_free_samples = product.free_sample; } m_list_of_packages.add(mediTitle.toUpperCase() + ", " + units + ", " + size + "|" + size + "|" + units + "|" + efp + "|" + pup + "|" + fap + "|" + fep + "|" + vat + "|" + swissmedic_cat + ",,|" + eanCode + "|" + pharma_code + "|" + visible + "|" + has_free_samples + "\n"); // Generate bar codes BarCode bc = new BarCode(); String barcodeImg64 = bc.encode(eanCode); mBarCodes.add("<p class=\"spacing1\">" + barcodeImg64 + "</p>"); content.append(barcodeImg64); } } // Generate section Packungen for search result if (isSectionPackungen) m_pack_info_str += (paraText + "\n"); } } /* // Add chapter "Barcodes" content.append("<p class=\"paragraph\"></p><div class=\"absTitle\">" + "Barcodes" + "</div>"); for (String bcode : mBarCodes) content.append(bcode); */ // Remove last comma from mEanCodes_str if (!mEanCodes_str.isEmpty()) mEanCodes_str = mEanCodes_str.substring(0, mEanCodes_str.length() - 2); // Remove last \n from mSectionPackungen_str if (!m_pack_info_str.isEmpty()) m_pack_info_str = m_pack_info_str.substring(0, m_pack_info_str.length() - 1); // Set title, autor mMedi.setTitle(mediTitle); mMedi.setAuthHolder(mediAuthor); mMedi.setAtcCode("PSEUDO"); mMedi.setSubstances(mediTitle); System.out.println(idx + " - " + mediTitle + ": " + mEanCodes_str); // Close previous div + monographie div content.append("</div></div>"); String title = "<div class=\"MonTitle\" id=\"section1\">" + mediTitle + "</div>"; String author = "<div class=\"ownerCompany\"><div style=\"text-align: right;\">" + mediAuthor + "</div></div>"; // Set "Medizinprodukt" label String pseudo = "<p class=\"spacing1\">" + mediPseudoTag + "</p>"; // Set medi content mediHtmlContent = "<html><head></head><body><div id=\"monographie\">" + title + author + pseudo + content.toString() + "</div></body></html>"; // Generate clean html file Document doc = Jsoup.parse(mediHtmlContent); doc.outputSettings().escapeMode(EscapeMode.xhtml); doc.outputSettings().charset("UTF-8"); doc.outputSettings().prettyPrint(true); doc.outputSettings().indentAmount(1); mediHtmlContent = doc.html(); // Set html content mMedi.setContent(mediHtmlContent); // Add to DB addToDB(); return true; } catch (IOException e) { e.printStackTrace(); return false; } }
From source file:org.olat.search.service.document.file.WordOOXMLDocument.java
License:Apache License
private void extractContent(final StringBuilder buffy, final XWPFDocument document) throws IOException, XmlException { // first all paragraphs final Iterator<XWPFParagraph> i = document.getParagraphsIterator(); while (i.hasNext()) { final XWPFParagraph paragraph = i.next(); CTSectPr ctSectPr = null;/* w w w . j a v a 2 s.co m*/ if (paragraph.getCTP().getPPr() != null) { ctSectPr = paragraph.getCTP().getPPr().getSectPr(); } XWPFHeaderFooterPolicy headerFooterPolicy = null; if (ctSectPr != null) { headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr); extractHeaders(buffy, headerFooterPolicy); } final XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( new XWPFHyperlinkDecorator(paragraph, null, true)); final CTBookmark[] bookmarks = paragraph.getCTP().getBookmarkStartArray(); for (final CTBookmark bookmark : bookmarks) { buffy.append(bookmark.getName()).append(' '); } buffy.append(decorator.getText()).append(' '); if (ctSectPr != null) { extractFooters(buffy, headerFooterPolicy); } } }