Example usage for the org.apache.poi.xwpf.usermodel.XWPFDocument constructor XWPFDocument(InputStream)

List of usage examples for the org.apache.poi.xwpf.usermodel.XWPFDocument constructor XWPFDocument(InputStream)

Introduction

On this page you can find example usages of the org.apache.poi.xwpf.usermodel.XWPFDocument constructor XWPFDocument(InputStream).

Prototype

public XWPFDocument(InputStream is) throws IOException 

Source Link

Usage

From source file:com.jgaap.generics.DocumentHelper.java

License:Open Source License

/**
 * Reads a Word (.docx) document from the given stream and returns its text.
 *
 * @param inputStream
 *            An input stream pointing to the Word document to be read.
 * @return the text produced by the word extractor, as a character array
 * @throws IOException
 *             if the document cannot be read from the stream
 */
private static char[] loadMSWordDocx(InputStream inputStream) throws IOException {
    final XWPFWordExtractor textExtractor = new XWPFWordExtractor(new XWPFDocument(inputStream));
    final String text = textExtractor.getText();
    return text.toCharArray();
}

From source file:com.maxl.java.aips2sqlite.PseudoExpertInfo.java

License:Open Source License

/**
 * Extracts all the important information from the pseudo "Fachinfo" file
 * @param pseudo_info_file/*from  w ww . j ava 2 s .  c  om*/
 */
public boolean extractInfo(int idx, FileInputStream pseudo_info_file) {
    mMedi = new MedicalInformations.MedicalInformation();

    mSectionContent = new ArrayList<String>();
    mSectionTitles = new ArrayList<String>();
    mBarCodes = new ArrayList<String>();
    m_list_of_packages = new ArrayList<String>();

    String mediTitle = "";
    String mediAuthor = "";
    String mediPseudoTag = "";
    String mediHtmlContent = "";

    StringBuilder content = new StringBuilder();

    try {
        // Read in docx file
        XWPFDocument docx = new XWPFDocument(pseudo_info_file);
        // Get iterator through all paragraphs
        Iterator<XWPFParagraph> para = docx.getParagraphsIterator();

        // Pre-process input stream to extract paragraph titles
        boolean goodToGo = false;
        while (para.hasNext()) {
            List<XWPFRun> runs = para.next().getRuns();
            if (!runs.isEmpty()) {
                for (XWPFRun r : runs) {
                    // bold and italics identifies section title!
                    if (r.isBold()) { // && r.isItalic()) {
                        String pText = r.getParagraph().getText();
                        // These are the first chapter titles (DE and FR)
                        if (pText.equals("Zusammensetzung") || pText.equals("Composition"))
                            goodToGo = true;
                        if (goodToGo == true)
                            mSectionTitles.add(pText);
                    }
                }
            }
        }
        // Add "nil" at the end
        mSectionTitles.add("nil");

        if (mLanguage.equals("de") && !mSectionTitles.get(0).equals("Zusammensetzung"))
            return false;
        if (mLanguage.equals("fr") && !mSectionTitles.get(0).equals("Composition"))
            return false;

        // Reset iterator
        para = docx.getParagraphsIterator();

        // Init list for section content 
        for (int i = 0; i < mSectionTitles.size(); ++i)
            mSectionContent.add(i, "");

        // Get title
        if (para.hasNext())
            mediTitle = para.next().getParagraphText();
        // Get author while using "Medizinprodukt" as tag
        String prevParaText = "";
        while (para.hasNext()) {
            String paraText = para.next().getParagraphText();
            // If this word is not found, then no pseudo FI will be produced
            if (paraText.equals("Medizinprodukt") || paraText.equals("Dispositif mdical")) {
                mediPseudoTag = paraText;
                mediAuthor = prevParaText;
                break;
            }
            prevParaText = paraText;
        }

        // Get section titles + sections + ean codes
        boolean isSectionPackungen = false;
        int numSection = 0;
        // Init with section1 and title
        String sectionId_str = "";
        String sectionTitle_str = "";
        mEanCodes_str = "";
        mSectionIds_str = "section1,";
        mSectionTitles_str = mediTitle + ",";
        m_pack_info_str = "";
        // This is the EAN code pattern
        Pattern pattern = Pattern.compile("^[0-9]{13}");
        // Loop through it, identifying medication title, author, section titles and corresponding titles
        while (para.hasNext()) {
            String paraText = para.next().getParagraphText();
            if (paraText.equals(mSectionTitles.get(numSection))) {
                // ->> Get section title
                isSectionPackungen = false;
                // Get section title
                if (numSection < mSectionTitles.size())
                    numSection++;
                // Section "Packungen" is special
                if (paraText.equals("Packungen") || paraText.equals("Prsentation")) {
                    isSectionPackungen = true;
                }
                // Close previous div
                if (numSection > 1)
                    content.append("</div>");
                // Create html
                sectionId_str = "section" + (numSection + 1); // section1 is reserved for the MonTitle
                sectionTitle_str = mSectionTitles.get(numSection - 1);
                content.append("<div class=\"paragraph\" id=\"" + sectionId_str + "\">");
                content.append("<div class=\"absTitle\">" + sectionTitle_str + "</div>");
                // Generate section id string
                mSectionIds_str += (sectionId_str + ",");
                // Generate titles string
                mSectionTitles_str += (sectionTitle_str + ";");
            } else {
                // ->> Get section content
                String s = mSectionContent.get(numSection - 1);
                mSectionContent.set(numSection - 1, s + paraText + " ");
                // Create html
                content.append("<p class=\"spacing1\">" + paraText + "</p>");
                // Extract EAN codes and start positions
                Matcher matcher = pattern.matcher(paraText);
                while (matcher.find()) {
                    String eanCode = matcher.group();
                    mEanCodes_str += (eanCode + ", ");
                    if (!eanCode.isEmpty()) {
                        String pup = "";
                        String efp = "";
                        String fep = "";
                        String fap = "";
                        String vat = "";
                        String size = "";
                        String units = "";
                        String swissmedic_cat = "";
                        String pharma_code = "";
                        int visible = 0xff;
                        int has_free_samples = 0x00; // by default no free samples
                        // Exctract fep and fap pricing information
                        // FAP = Fabrikabgabepreis = EFP?
                        // FEP = Fachhandelseinkaufspreis
                        // EFP = FAP < FEP < PUP                     
                        if (m_map_products != null && eanCode != null && m_map_products.containsKey(eanCode)) {
                            Product product = m_map_products.get(eanCode);
                            if (product.efp > 0.0f)
                                efp = String.format("CHF %.2f", product.efp);
                            if (product.pp > 0.0f)
                                pup = String.format("CHF %.2f", product.pp);
                            if (product.fap > 0.0f)
                                fap = String.format("CHF %.2f", product.fap);
                            if (product.fep > 0.0f)
                                fep = String.format("CHF %.2f", product.fep);
                            if (product.vat > 0.0f)
                                vat = String.format("%.2f", product.vat);
                            if (product.size != null && !product.size.isEmpty())
                                size = product.size;
                            if (product.units != null && product.units.length > 0)
                                units = product.units[0];
                            if (product.swissmedic_cat != null && !product.swissmedic_cat.isEmpty())
                                swissmedic_cat = product.swissmedic_cat;
                            if (product.pharmacode != null && !product.pharmacode.isEmpty())
                                pharma_code = product.pharmacode;
                            visible = product.visible;
                            has_free_samples = product.free_sample;
                        }
                        m_list_of_packages.add(mediTitle.toUpperCase() + ", " + units + ", " + size + "|" + size
                                + "|" + units + "|" + efp + "|" + pup + "|" + fap + "|" + fep + "|" + vat + "|"
                                + swissmedic_cat + ",,|" + eanCode + "|" + pharma_code + "|" + visible + "|"
                                + has_free_samples + "\n");
                        // Generate bar codes
                        BarCode bc = new BarCode();
                        String barcodeImg64 = bc.encode(eanCode);
                        mBarCodes.add("<p class=\"spacing1\">" + barcodeImg64 + "</p>");
                        content.append(barcodeImg64);
                    }
                }
                // Generate section Packungen for search result
                if (isSectionPackungen)
                    m_pack_info_str += (paraText + "\n");
            }
        }
        /*
        // Add chapter "Barcodes"
        content.append("<p class=\"paragraph\"></p><div class=\"absTitle\">" + "Barcodes" + "</div>");
        for (String bcode : mBarCodes)
           content.append(bcode);
        */
        // Remove last comma from mEanCodes_str
        if (!mEanCodes_str.isEmpty())
            mEanCodes_str = mEanCodes_str.substring(0, mEanCodes_str.length() - 2);
        // Remove last \n from mSectionPackungen_str
        if (!m_pack_info_str.isEmpty())
            m_pack_info_str = m_pack_info_str.substring(0, m_pack_info_str.length() - 1);

        // Set title, autor
        mMedi.setTitle(mediTitle);
        mMedi.setAuthHolder(mediAuthor);
        mMedi.setAtcCode("PSEUDO");
        mMedi.setSubstances(mediTitle);

        System.out.println(idx + " - " + mediTitle + ": " + mEanCodes_str);

        // Close previous div + monographie div
        content.append("</div></div>");
        String title = "<div class=\"MonTitle\" id=\"section1\">" + mediTitle + "</div>";
        String author = "<div class=\"ownerCompany\"><div style=\"text-align: right;\">" + mediAuthor
                + "</div></div>";
        // Set "Medizinprodukt" label
        String pseudo = "<p class=\"spacing1\">" + mediPseudoTag + "</p>";
        // Set medi content         
        mediHtmlContent = "<html><head></head><body><div id=\"monographie\">" + title + author + pseudo
                + content.toString() + "</div></body></html>";

        // Generate clean html file
        Document doc = Jsoup.parse(mediHtmlContent);
        doc.outputSettings().escapeMode(EscapeMode.xhtml);
        doc.outputSettings().charset("UTF-8");
        doc.outputSettings().prettyPrint(true);
        doc.outputSettings().indentAmount(1);
        mediHtmlContent = doc.html();

        // Set html content
        mMedi.setContent(mediHtmlContent);

        // Add to DB
        addToDB();

        return true;
    } catch (IOException e) {
        e.printStackTrace();
        return false;
    }
}

From source file:com.min.word.core.ReadWordFileTest.java

License:Apache License

/**
 * Reads {@code test.docx} from the working directory and prints its extracted text to
 * standard output, framed by a start and an end marker line.
 *
 * @param args
 *            command-line arguments (not used)
 * @throws Exception
 *             if the file cannot be opened or parsed
 */
public static void main(String[] args) throws Exception {
    System.out.println("---------------- Read File Start ------------------");
    FileInputStream in = new FileInputStream("test.docx");
    try {
        XWPFDocument document = new XWPFDocument(in);
        XWPFWordExtractor we = new XWPFWordExtractor(document);
        System.out.println(we.getText());
    } finally {
        // Release the file handle even when parsing fails.
        in.close();
    }
    System.out.println("---------------- Read File End ------------------");
}

From source file:com.opensearchserver.extractor.parser.Docx.java

License:Apache License

/**
 * Parses a .docx stream: copies the core properties (when present) into {@code metas}, then
 * adds the extracted text and the result of {@code languageDetection(CONTENT, 10000)} to a
 * new parser document. The extractor is closed quietly afterwards.
 *
 * @param inputStream
 *            stream supplying the .docx content
 * @param extension
 *            not used by this implementation
 * @param mimeType
 *            not used by this implementation
 * @throws IOException
 *             if the document cannot be read
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws IOException {

    final XWPFDocument docx = new XWPFDocument(inputStream);
    XWPFWordExtractor extractor = null;
    try {
        extractor = new XWPFWordExtractor(docx);

        // Document metadata, if the package carries any
        final CoreProperties props = extractor.getCoreProperties();
        if (props != null) {
            metas.add(TITLE, props.getTitle());
            metas.add(CREATOR, props.getCreator());
            metas.add(CREATION_DATE, props.getCreated());
            metas.add(MODIFICATION_DATE, props.getModified());
            metas.add(SUBJECT, props.getSubject());
            metas.add(DESCRIPTION, props.getDescription());
            metas.add(KEYWORDS, props.getKeywords());
        }

        // Body text, then language detection on it
        final ParserDocument result = getNewParserDocument();
        final String text = extractor.getText();
        result.add(CONTENT, text);
        result.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        IOUtils.closeQuietly(extractor);
    }
}

From source file:com.opensearchserver.textextractor.parser.Docx.java

License:Apache License

/**
 * Parses a .docx stream: copies the core properties (when present) into {@code metas}, then
 * adds the extracted text and the result of {@code languageDetection(CONTENT, 10000)} to a
 * new parser document. The extractor is closed quietly afterwards.
 *
 * @param inputStream
 *            stream supplying the .docx content
 * @throws IOException
 *             if the document cannot be read
 */
@Override
protected void parseContent(InputStream inputStream) throws IOException {

    final XWPFDocument docx = new XWPFDocument(inputStream);
    XWPFWordExtractor extractor = null;
    try {
        extractor = new XWPFWordExtractor(docx);

        // Document metadata, if the package carries any
        final CoreProperties props = extractor.getCoreProperties();
        if (props != null) {
            metas.add(TITLE, props.getTitle());
            metas.add(CREATOR, props.getCreator());
            metas.add(CREATION_DATE, props.getCreated());
            metas.add(MODIFICATION_DATE, props.getModified());
            metas.add(SUBJECT, props.getSubject());
            metas.add(DESCRIPTION, props.getDescription());
            metas.add(KEYWORDS, props.getKeywords());
        }

        // Body text, then language detection on it
        final ParserDocument result = getNewParserDocument();
        final String text = extractor.getText();
        result.add(CONTENT, text);
        result.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        IOUtils.closeQuietly(extractor);
    }
}

From source file:com.pdf.GetPdf.java

/**
 * Downloads a Word document from {@code url} and appends the text of its paragraphs to
 * {@code document}, one {@code Paragraph} per Word paragraph.
 * <p>
 * For {@code type} "doc" the binary format is read and line terminators are stripped from
 * each paragraph; any other type is read as .docx. In the .docx case only top-level
 * paragraphs are converted; tables are not converted.
 *
 * @param document
 *            target document the paragraphs are added to
 * @param url
 *            location of the Word file
 * @param type
 *            "doc" for the binary format, anything else for .docx
 * @throws IOException
 *             if the URL cannot be opened or the Word file cannot be read
 * @throws DocumentException
 *             if a paragraph cannot be added to {@code document}
 */
public static void docConvert(Document document, String url, String type)
        throws IOException, DocumentException {
    // Evaluated before the stream is opened, as before.
    boolean isLegacyDoc = type.equals("doc");

    java.io.InputStream in = new URL(url).openStream();
    try {
        if (isLegacyDoc) {
            HWPFDocument wordDoc = new HWPFDocument(in);
            WordExtractor we = new WordExtractor(wordDoc);
            for (String paragraph : we.getParagraphText()) {
                document.add(new Paragraph(paragraph.replaceAll("\\cM?\r?\n", "")));
            }
        } else {
            XWPFDocument wordDoc = new XWPFDocument(in);
            for (IBodyElement content : wordDoc.getBodyElements()) {
                if (content.getElementType() == BodyElementType.PARAGRAPH) {
                    // Add this element only. Going through content.getBody().getParagraphs()
                    // re-added every paragraph of the body for each paragraph element.
                    document.add(new Paragraph(((XWPFParagraph) content).getParagraphText()));
                }
                // BodyElementType.TABLE: tables are not converted.
            }
        }
    } finally {
        // The stream was previously never closed.
        in.close();
    }
}

From source file:com.project3.utils.poi.ApachePOIChecker.java

/**
 * Opens the .docx file {@code filename}, runs a fixed series of formatting checks against it
 * (questions 1, 2, 3, 5, 7 and 8), prints each result to standard output and records it via
 * {@code addResultsToList}. {@code resultList} is reset at the start of every call.
 * An {@link IOException} while reading the file is logged and otherwise ignored.
 *
 * @param filename
 *            path of the .docx file to check
 */
public static void checkDocument(String filename) {
    resultList = new ArrayList<ResultModel>();

    try {
        // Open document to check; the file handle is released as soon as it is loaded.
        XWPFDocument docx1;
        FileInputStream in = new FileInputStream(new File(filename));
        try {
            docx1 = new XWPFDocument(in);
        } finally {
            in.close();
        }

        // Put the following to an XML file that contains strings to check with respective properties to check
        // Question 1 in Level 1
        // Initialize strings to find
        // NOTE(review): sl is never cleared, so every later check also covers the strings of
        // the earlier questions - confirm this is intended.
        List<String> sl = new ArrayList<String>();
        String[] tl = { "Melissa Martin", "555 West Main St.", "Sampaloc, Metro Manila", "Phone: 312-312-3123",
                "E-mail: TeachMartin@email.com" };
        sl.addAll(Arrays.asList(tl));

        // Initialize properties these strings should have
        Map<String, String> properties = new HashMap<String, String>();
        properties.put("FONT FAMILY", "MV Boli");
        properties.put("FONT SIZE", "12");
        // We go through all paragraphs of the document and check for the presence of the strings
        // If they are present, check if the properties given above are present
        // Result is displayed as String = {Property1 = Score1, Property2 = Score2, ...}
        // Scores are determined by the number of elements within the paragraph which follows the given formatting
        Map<String, HashMap> results;
        results = DocumentPropertyChecker.checkRunPropertiesOfParagraphs(docx1.getParagraphs(), sl, properties);
        System.out.println("1. " + results.toString());
        addResultsToList(results, properties);

        //2
        tl = new String[] { "Summary", "Educational Background", "Related Work Experience",
                "Additional Work Experience" };
        sl.addAll(Arrays.asList(tl));

        //properties
        properties = new HashMap<String, String>();
        properties.put("BOLD", "true");

        results = DocumentPropertyChecker.checkRunPropertiesOfParagraphs(docx1.getParagraphs(), sl, properties);
        System.out.println("2. " + results.toString());
        addResultsToList(results, properties);

        //3
        tl = new String[] { "Holds Bachelor's Degree in Music and Education with TEFL certification",
                "5 years experience in teaching Englsih to Spanish speaking students ages 12 and up",
                "Exceptional skills in teaching English and Spanish language",
                "Bachelor of Music; Univeristy of Sto. Tomas 2004",
                "Bachelor of Science in Education; Univerity of the Philippines 2008" };
        sl.addAll(Arrays.asList(tl));
        properties = new HashMap<String, String>();
        properties.put("LINE SPACING", "1.5");

        results = DocumentPropertyChecker.checkPropertiesOfParagraphs(docx1.getParagraphs(), sl, properties);
        System.out.println("3. " + results.toString());
        addResultsToList(results, properties);

        //5
        // NOTE(review): "2011  Present" looks like it lost a dash character - verify against
        // the reference document.
        tl = new String[] { "St. Peter's University", "2011  Present",
                "Teaches English and Spanish to students ages 15 and up",
                "Creates course materials, including exams, quizzes and visual aids used by all teachers throughout the organization",
                "Initiates programs focused in improving grammar and active listening, writing and speaking skills of students" };
        sl.addAll(Arrays.asList(tl));
        properties = new HashMap<String, String>();
        properties.put("NUMBERING FORMAT", "bullet");

        results = DocumentPropertyChecker.checkPropertiesOfParagraphs(docx1.getParagraphs(), sl, properties);
        System.out.println("5. " + results.toString());
        addResultsToList(results, properties);

        //7
        properties = new HashMap<String, String>();
        properties.put("MARGIN TOP", "2");
        properties.put("MARGIN BOTTOM", "2");
        properties.put("MARGIN LEFT", "2");
        properties.put("MARGIN RIGHT", "2");

        Map<String, Object> results2;
        results2 = DocumentPropertyChecker.checkPropertiesOfDocument(docx1, properties);
        HashMap<String, String> temp = new HashMap<String, String>();

        // Document-level results are converted to strings and reported under "Page Format".
        for (Entry<String, Object> r : results2.entrySet()) {
            temp.put(r.getKey(), r.getValue().toString());
        }
        System.out.println("7. " + results2.toString());
        results.clear();
        results.put("Page Format", temp);
        addResultsToList(results, properties);

        //8
        temp.clear();
        properties = new HashMap<String, String>();
        properties.put("ALIGN", "both");

        results2 = DocumentPropertyChecker.checkPropertiesOfAllParagraphs(docx1.getParagraphs(), properties);
        System.out.println("8. " + results2.toString());

        for (Entry<String, Object> r : results2.entrySet()) {
            temp.put(r.getKey(), r.getValue().toString());
        }

        results.clear();
        results.put("Page Format", temp);
        addResultsToList(results, properties);

    } catch (IOException ex) {
        Logger.getLogger(ApachePOIChecker.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:com.project3.utils.poiold.ApachePOIChecker.java

/**
 * Tries to open {@code filename} as a .docx document. Nothing is done with the document; a
 * failure to open or read the file is printed as a stack trace.
 *
 * @param filename
 *            path of the .docx file to open
 */
public static void checkDocument2(String filename) {
    try {
        FileInputStream in = new FileInputStream(new File(filename));
        try {
            new XWPFDocument(in);
        } finally {
            // Release the file handle whether or not parsing succeeded.
            in.close();
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException, which was handled identically.
        e.printStackTrace();
    }

}

From source file:com.project3.utils.poiold.ApachePOIChecker.java

/**
 * Opens a .docx file, runs a fixed series of checks against it (questions 1 to 8) and prints
 * each result to standard output. An {@link IOException} while reading the file is logged and
 * otherwise ignored.
 *
 * @param filename
 *            path of the .docx file to check; when {@code null} or empty, the previously
 *            hard-coded sample file is used
 */
public static void checkDocument1(String filename) {
    try {
        // The parameter used to be ignored in favour of this fixed path; it is now only the
        // fallback.
        String path = (filename == null || filename.isEmpty())
                ? "C:\\Users\\Noel\\Documents\\NetBeansProjects\\ApachePOITest\\resume_only.docx"
                : filename;

        // Open document to check; the file handle is released as soon as it is loaded.
        XWPFDocument docx1;
        FileInputStream in = new FileInputStream(new File(path));
        try {
            docx1 = new XWPFDocument(in);
        } finally {
            in.close();
        }

        // Put the following to an XML file that contains strings to check with respective properties to check
        // Question 1 in Level 1
        // Initialize strings to find
        // NOTE(review): sl is never cleared, so every later check also covers the strings of
        // the earlier questions - confirm this is intended.
        ArrayList<String> sl = new ArrayList<String>();
        String[] tl = { "Melissa Martin", "555 West Main St.", "Sampaloc, Metro Manila", "Phone: 312-312-3123",
                "E-mail: TeachMartin@email.com" };
        sl.addAll(Arrays.asList(tl));

        // Initialize properties these strings should have
        Map<String, String> properties = new HashMap<String, String>();
        properties.put("FONT FAMILY", "MV Boli");
        properties.put("FONT SIZE", "12");
        // We go through all paragraphs of the document and check for the presence of the strings
        // If they are present, check if the properties given above are present
        // Result is displayed as String = {Property1 = Score1, Property2 = Score2, ...}
        // Scores are determined by the number of elements within the paragraph which follows the given formatting
        Map<String, HashMap> results;
        results = DocumentPropertyCheckerOld.checkRunPropertiesOfParagraphs(docx1.getParagraphs(), sl,
                properties);
        System.out.println("1. " + results.toString());
        System.out.println("");
        //2
        tl = new String[] { "Summary", "Educational Background", "Related Work Experience",
                "Additional Work Experience" };
        sl.addAll(Arrays.asList(tl));

        //properties
        properties = new HashMap<String, String>();
        properties.put("BOLD", "true");

        results = DocumentPropertyCheckerOld.checkRunPropertiesOfParagraphs(docx1.getParagraphs(), sl,
                properties);
        System.out.println("2. " + results.toString());
        System.out.println("");
        //3
        tl = new String[] { "Holds Bachelor's Degree in Music and Education with TEFL certification",
                "5 years experience in teaching Englsih to Spanish speaking students ages 12 and up",
                "Exceptional skills in teaching English and Spanish language",
                "Bachelor of Music; Univeristy of Sto. Tomas 2004",
                "Bachelor of Science in Education; Univerity of the Philippines 2008" };
        sl.addAll(Arrays.asList(tl));
        properties = new HashMap<String, String>();
        properties.put("LINE SPACING", "1.5");

        results = DocumentPropertyCheckerOld.checkPropertiesOfParagraphs(docx1.getParagraphs(), sl, properties);
        System.out.println("3. " + results.toString());
        System.out.println("");
        //4
        tl = new String[] { "2008-2011" };
        sl.addAll(Arrays.asList(tl));
        results = DocumentPropertyCheckerOld.checkIfStringExistsInParagraphs(docx1.getParagraphs(), sl);
        System.out.println("4. " + results.toString());
        System.out.println("");
        //5
        // NOTE(review): "2011  Present" looks like it lost a dash character - verify against
        // the reference document.
        tl = new String[] { "St. Peter's University", "2011  Present",
                "Teaches English and Spanish to students ages 15 and up",
                "Creates course materials, including exams, quizzes and visual aids used by all teachers throughout the organization",
                "Initiates programs focused in improving grammar and active listening, writing and speaking skills of students" };
        sl.addAll(Arrays.asList(tl));
        properties = new HashMap<String, String>();
        properties.put("NUMBERING FORMAT", "bullet");

        results = DocumentPropertyCheckerOld.checkPropertiesOfParagraphs(docx1.getParagraphs(), sl, properties);
        System.out.println("5. " + results.toString());
        System.out.println("");
        //6
        tl = new String[] { "Black Pen Movement \u00AE" };
        sl.addAll(Arrays.asList(tl));
        results = DocumentPropertyCheckerOld.checkIfStringExistsInParagraphs(docx1.getParagraphs(), sl);
        System.out.println("6. " + results.toString());
        System.out.println("");
        //7
        properties = new HashMap<String, String>();
        properties.put("MARGIN TOP", "2");
        properties.put("MARGIN BOTTOM", "2");
        properties.put("MARGIN LEFT", "2");
        properties.put("MARGIN RIGHT", "2");

        System.out.println(
                "7. " + DocumentPropertyCheckerOld.checkPropertiesOfDocument(docx1, properties).toString());
        System.out.println("");
        //8
        properties = new HashMap<String, String>();
        properties.put("ALIGN", "both");

        System.out.println("8. " + DocumentPropertyCheckerOld
                .checkPropertiesOfAllParagraphs(docx1.getParagraphs(), properties).toString());

    } catch (IOException ex) {
        Logger.getLogger(ApachePOIChecker.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:com.qubit.terra.docs.util.DocxToPdfReportConverter.java

License:Open Source License

/**
 * Converts a .docx document read from the given stream into PDF, using the default
 * {@code PdfOptions} with this converter's {@code fontProvider}.
 *
 * @param document
 *            stream supplying the .docx content
 * @return the bytes written by the PDF converter
 * @throws ReportGenerationException
 *             wrapping any {@link IOException} raised during the conversion
 */
@Override
public byte[] convert(final InputStream document) {
    final ByteArrayOutputStream pdfBytes = new ByteArrayOutputStream();
    try {
        final XWPFDocument wordDocument = new XWPFDocument(document);

        final PdfOptions pdfOptions = PdfOptions.getDefault();
        pdfOptions.fontProvider(fontProvider);

        PdfConverter.getInstance().convert(wordDocument, pdfBytes, pdfOptions);
    } catch (IOException e) {
        throw new ReportGenerationException("Error converting the report", e);
    }
    return pdfBytes.toByteArray();
}