Example usage for the org.apache.poi.hwpf.HWPFDocument constructor

List of usage examples for the org.apache.poi.hwpf.HWPFDocument constructor

Introduction

On this page you can find example usages of the org.apache.poi.hwpf.HWPFDocument constructor.

Prototype

public HWPFDocument(DirectoryNode directory) throws IOException 

Source Link

Document

This constructor loads a Word document from a specific point in a POIFSFileSystem, probably not the default.

Usage

From source file:com.unsa.view.MainView.java

License:Creative Commons License

/**
 * Converts a Word file into a PDF placed next to it.
 *
 * <p>The paragraphs are extracted with {@link WordExtractor} and written, one PDF
 * paragraph each, to a file whose path is the source path with its last three
 * characters replaced by {@code "pdf"}. Any failure is printed to standard error;
 * nothing is propagated to the caller.
 *
 * @param file1 the Word file to convert
 */
private void DocConverterPDF(File file1) {
    try {
        String sourcePath = file1.getAbsolutePath();
        System.out.println(sourcePath);

        // Read all paragraph text first so the POI file system can be released
        // before any PDF output is produced.
        String[] paragraphs;
        try (NPOIFSFileSystem fs = new NPOIFSFileSystem(new FileInputStream(sourcePath))) {
            HWPFDocument doc = new HWPFDocument(fs.getRoot());
            paragraphs = new WordExtractor(doc).getParagraphText();
        }

        // Assumes a three-character extension such as "doc" -- the trailing three
        // characters are dropped and "pdf" appended.
        String output = sourcePath.substring(0, sourcePath.length() - 3);
        writeParagraphsAsPdf(paragraphs, new File(output + "pdf"));
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/**
 * Writes each paragraph, stripped of its trailing line terminator, to a new PDF file.
 *
 * @param paragraphs the paragraph texts to write
 * @param target     the PDF file to create
 */
private void writeParagraphsAsPdf(String[] paragraphs, File target)
        throws java.io.IOException, com.lowagie.text.DocumentException {
    com.lowagie.text.Document document = new com.lowagie.text.Document();

    try (OutputStream fileout = new FileOutputStream(target)) {
        PdfWriter writer = PdfWriter.getInstance(document, fileout);
        try {
            document.open();
            writer.setPageEmpty(true);
            document.newPage();
            writer.setPageEmpty(true);

            for (String paragraph : paragraphs) {
                document.add(new Paragraph(paragraph.replaceAll("\\cM?\r?\n", "")));
            }
        } finally {
            // The document must be closed while the output stream is still open.
            document.close();
        }
    }
}

From source file:com.virtusa.isq.vtaf.runtime.SeleniumTestBase.java

License:Apache License

/**
 * Read doc file./*w  w w .j a v a 2s. c  o m*/
 *
 * @param fileName
 *            the file name
 * @return the string
 */
public final String readDocFile(final String fileName) {
    String docContent = null;
    try {
        File file = new File(fileName);
        FileInputStream fis = new FileInputStream(file.getAbsolutePath());

        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor we = new WordExtractor(doc);

        docContent = we.getText();
        System.out.println("MS Word(.doc) Document Red, Content:" + docContent);

        fis.close();
    } catch (IOException e) {
        e.printStackTrace();
        reportresult(true, "CHECK DOCUMENT :", "FAILED",
                "CheckDocument command NODECOUNT : Execption occured. Actual error : " + e.getMessage());
        checkTrue(false, false,
                "CheckDocument command NODECOUNT : Execption occured. Actual error : " + e.getMessage());
    }
    return docContent;

}

From source file:com.weibo.datasys.parser.office.extractor.WordParse.java

License:Open Source License

/**
 * Extracts the text of a Word (.doc) file into a {@code FileData}.
 *
 * <p>The returned object always carries the file's name. Its content is set only
 * when the document could be read; failures are logged and not propagated.
 *
 * @param filePath the Word file to read
 * @return a {@code FileData} holding the file name and, on success, the document text
 */
public FileData extractorDoc(File filePath) {
    FileData fData = new FileData();
    fData.setName(filePath.getName());

    // try-with-resources ensures the file handle is released on every path.
    try (FileInputStream in = new FileInputStream(filePath)) {
        HWPFDocument doc = new HWPFDocument(in);
        fData.setContent(doc.getRange().text());
    } catch (Exception e) {
        LOG.error("Failed to extract text from " + filePath, e);
    }
    return fData;
}

From source file:com.xpn.xwiki.plugin.lucene.textextraction.MSWordTextExtractor.java

License:Apache License

/**
 * Returns the text of a Word document supplied as raw bytes.
 *
 * @param data the bytes of the Word document
 * @return the text of the document's overall range
 * @throws Exception if the document cannot be parsed
 */
public String getText(byte[] data) throws Exception {
    final ByteArrayInputStream source = new ByteArrayInputStream(data);
    final HWPFDocument document = new HWPFDocument(source);
    return document.getRange().text();
}

From source file:com.xx.platform.util.tools.ms.WordExtractor.java

License:Apache License

/**
 * Create a new Word Extractor by building an {@code HWPFDocument} from the
 * given file system and delegating to the document-based constructor.
 *
 * @param fs POIFSFileSystem containing the word file
 * @throws IOException if the document cannot be created from the file system
 */
public WordExtractor(POIFSFileSystem fs) throws IOException {
    this(new HWPFDocument(fs));
}

From source file:com.xx.platform.util.tools.ms.WordExtractor.java

License:Apache License

/**
 * Loads the Word document held in the given file system, stores it as this
 * extractor's current document, and returns its text.
 *
 * @param poifs the file system containing the Word document
 * @return the result of {@code getText()} for the newly loaded document
 * @throws Exception if the document cannot be loaded or its text extracted
 */
public String extractText(POIFSFileSystem poifs) throws Exception {
    final HWPFDocument loaded = new HWPFDocument(poifs);
    this.doc = loaded;
    return getText();
}

From source file:com.zhch.example.poi.Word2Forrest.java

License:Apache License

/**
 * Converts the hard-coded sample Word document to XML and prints a completion
 * message.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if either file cannot be opened, read, written or closed
 */
public static void main(String[] args) throws IOException {
    // Resources are closed in reverse declaration order (out, then is), and the
    // input stream is released even if opening or closing the output fails.
    try (InputStream is = new FileInputStream("d:\\temp\\d3\\luyang.doc");
            OutputStream out = new FileOutputStream("d:\\temp\\d3\\test.xml")) {
        new Word2Forrest(new HWPFDocument(is), out);
    }
    System.out.println("over!!");
}

From source file:cv_extractor.DocReader.java

/**
 * Scans every paragraph of a Word (.doc) file for the configured pattern and
 * adds the first match found in each paragraph to {@code data}.
 *
 * <p>Failures are printed to standard error and not propagated.
 *
 * @param localFile the Word file to scan
 */
protected static void readDocFile(File localFile) {
    // try-with-resources closes the stream even if parsing or matching fails.
    try (FileInputStream fis = new FileInputStream(localFile.getAbsolutePath())) {
        // HWPF handles the binary .doc format.
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor we = new WordExtractor(doc);

        String[] paragraphs = we.getParagraphText();
        System.out.println("Total no of paragraph " + paragraphs.length);

        // Compile once; the expression does not change between paragraphs.
        Pattern r = Pattern.compile(pattern);

        for (String para : paragraphs) {
            Matcher m = r.matcher(para);
            if (m.find()) {
                // group() is the subsequence matched by the find() above.
                data.add(m.group());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:de.uni_siegen.wineme.come_in.thumbnailer.util.mime.DocFileIdentifier.java

License:Open Source License

/**
 * Refines an office MIME type to {@code DOC_MIME_TYPE} when the file can be
 * parsed as a Word document with a non-empty range.
 *
 * @param mimeType the MIME type detected so far
 * @param bytes    not used by this identifier
 * @param file     the file to probe
 * @return {@code DOC_MIME_TYPE} if the probe succeeds, otherwise {@code mimeType} unchanged
 */
@Override
public String identify(String mimeType, byte[] bytes, File file) {

    if (isOfficeFile(mimeType) && !DOC_MIME_TYPE.equals(mimeType)) {
        // try-with-resources releases the file handle whether or not parsing succeeds.
        try (FileInputStream stream = new FileInputStream(file)) {
            HWPFDocument document = new HWPFDocument(stream);

            if (document.getRange().getEndOffset() > 0) {
                return DOC_MIME_TYPE;
            }
        } catch (Throwable ignored) {
            // Best-effort probe: any failure leaves the incoming MIME type
            // unchanged, so it is deliberately not reported.
        }
    }

    return mimeType;
}

From source file:edu.temple.CIS3238.readEasyUI.java

/**
 * Reads a Word (.doc) file and splits its paragraph text on single spaces.
 *
 * <p>Failures are printed to standard error and not propagated.
 *
 * @param fileName path of the Word file to read
 * @return the space-separated tokens of the document text, or {@code null} if
 *         the document could not be read or yielded no text
 */
private static String[] readMyDocument(String fileName) {
    String[] strArray = null;

    // try-with-resources guarantees the file handle is released on every path.
    try (FileInputStream in = new FileInputStream(fileName)) {
        POIFSFileSystem fs = new POIFSFileSystem(in);
        HWPFDocument doc = new HWPFDocument(fs);

        String text = readParagraphs(doc);
        if (text != null) {
            strArray = text.split(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    return strArray;
}