Example usage for org.apache.poi.hwpf HWPFDocument HWPFDocument

List of usage examples for org.apache.poi.hwpf HWPFDocument HWPFDocument

Introduction

On this page you can find example usages of the org.apache.poi.hwpf.HWPFDocument constructor.

Prototype

public HWPFDocument(DirectoryNode directory) throws IOException 

Source Link

Document

This constructor loads a Word document from a specific point in a POIFSFileSystem, probably not the default.

Usage

From source file:edu.ur.ir.index.DefaultWordTextExtractor.java

License:Apache License

/**
 * Reads a Word 97-2003 document and returns its text content.
 *
 * @param f the Word file to read
 * @return the extracted text, or {@code null} when the file is empty, is
 *         rejected by {@code isFileTooLarge}, or contains only whitespace
 * @throws Exception any failure raised while opening or parsing the file;
 *         it is logged and then rethrown unchanged
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    // Guard clause: oversized or empty files are never opened.
    if (isFileTooLarge(f) || f.length() <= 0L) {
        return null;
    }

    String extracted = null;
    FileInputStream inputStream = null;
    try {
        inputStream = new FileInputStream(f);
        WordExtractor wordExtractor = new WordExtractor(new HWPFDocument(inputStream));

        String candidate = wordExtractor.getText();
        boolean hasContent = candidate != null && !candidate.trim().isEmpty();
        if (hasContent) {
            extracted = candidate;
        }
    } catch (OutOfMemoryError oome) {
        // Logged for diagnosis, then propagated so the caller still sees the error.
        log.error("could not extract text", oome);
        throw oome;
    } catch (Exception e) {
        log.error("could not get text for word document " + f.getAbsolutePath(), e);
        throw e;
    } finally {
        closeInputStream(inputStream);
    }
    return extracted;
}

From source file:eu.modelwriter.ide.ui.command.ExtractTextHandler.java

License:Open Source License

/**
 * Extracts text from the given .doc {@link IFile}.
 * /* ww  w  .  j av a  2 s  .c o m*/
 * @param file
 *            the .doc {@link IFile}
 */
private void exctractDoc(final IFile file) {
    try {
        FileInputStream fis = new FileInputStream(file.getLocation().toFile());
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor we = new WordExtractor(doc);
        final IPath textPath = file.getFullPath().removeFileExtension().addFileExtension("txt");
        final IFile textFile = ResourcesPlugin.getWorkspace().getRoot().getFile(textPath);
        if (textFile.exists()) {
            textFile.delete(true, new NullProgressMonitor());
        }
        textFile.create(new ByteArrayInputStream(we.getText().getBytes()), true, new NullProgressMonitor());
        we.close();
        fis.close();
    } catch (IOException e) {
        Activator.getDefault().getLog().log(new Status(IStatus.ERROR, Activator.PLUGIN_ID,
                UNABLE_TO_EXTRACT_TEXT_FROM + file.getFullPath(), e));
    } catch (CoreException e) {
        Activator.getDefault().getLog().log(new Status(IStatus.ERROR, Activator.PLUGIN_ID,
                UNABLE_TO_EXTRACT_TEXT_FROM + file.getFullPath(), e));
    }
}

From source file:File.DOC.ReadDoc.java

/**
 * Prints the paragraphs of the Word document at
 * {@code path + namafile + ".doc"} to standard output, preceded by a line
 * stating the number of paragraphs.
 * <p>
 * Any failure while opening or parsing the document is caught and its stack
 * trace is printed; nothing is propagated to the caller.
 *
 * @param path     directory prefix, concatenated as-is with {@code namafile}
 * @param namafile file name without the {@code .doc} extension
 */
public void Read(String path, String namafile) {
    File file = new File(path + namafile + ".doc");
    // try-with-resources guarantees the stream is closed even if parsing fails.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor we = new WordExtractor(doc);

        String[] paragraphs = we.getParagraphText();

        System.out.println("Total no of paragraph " + paragraphs.length);
        for (String para : paragraphs) {
            System.out.println(para);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}

From source file:File.DOC.WriteDoc.java

/**
 * Inserts {@code content} in bold at the beginning of a template document and
 * saves the result as {@code path + namafile + ".doc"}.
 * <p>
 * Any failure is caught and only its message is printed to standard output.
 *
 * @param path     directory prefix, concatenated as-is with {@code namafile}
 * @param namafile output file name without the {@code .doc} extension
 * @param content  text to insert; each {@code "\n"} is replaced by U+000B
 */
public void Write(String path, String namafile, String content) {
    // NOTE(review): the template location is hard-coded; confirm whether it
    // should be configurable.
    File file = new File("D:\\xyz.doc");
    try {
        HWPFDocument doc;
        // The template stream is only needed while the document is loaded.
        try (FileInputStream templateIn = new FileInputStream(file)) {
            doc = new HWPFDocument(new POIFSFileSystem(templateIn));
        }
        Range range = doc.getRange();
        // "\013" is the octal escape for U+000B (vertical tab); presumably used
        // as the in-paragraph line break of the Word binary format.
        CharacterRun run = range.insertBefore(content.replace("\n", "\013"));
        run.setBold(true);
        // The output is opened only after the edit succeeded, and is always closed.
        try (OutputStream outa = new FileOutputStream(new File(path + namafile + ".doc"))) {
            doc.write(outa);
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
}

From source file:FileParser.Parser.java

/**
 * Parses the file currently selected in {@code fileChooser} and returns the
 * extracted content.
 * <p>
 * The element at index 1 of {@code fileChooser.getFileType()} selects the
 * parser:
 * <ul>
 * <li>{@code txt} - one list entry per line</li>
 * <li>{@code html} - a single entry holding the text of the document</li>
 * <li>{@code pdf} - a single entry holding the text of pages 1 to 2</li>
 * <li>{@code doc} - a single entry holding the text of the Word document</li>
 * <li>{@code xml} - one entry per {@code Cube} element, built from its
 * {@code currency} and {@code rate} attributes</li>
 * </ul>
 * Any other type shows an error dialog and terminates the JVM.
 * <p>
 * The {@code file} and {@code getParsedData} fields are reassigned on every call.
 *
 * @return the list stored in {@code getParsedData}
 * @throws IOException  if reading a text or XML file fails
 * @throws SAXException if the XML file cannot be parsed
 */
public ArrayList<String> fileParser() throws IOException, SAXException {
    String[] getArray = fileChooser.getFileType();
    String type = getArray[1];
    String fileContent = "";
    file = fileChooser.getFile();
    getParsedData = new ArrayList<>();

    switch (type) {
    case "txt":

        try {
            FileReader contentReader = new FileReader(file.getPath());
            bReader = new BufferedReader(contentReader);
            while ((fileContent = bReader.readLine()) != null) {
                getParsedData.add(fileContent);
            }
        }

        catch (FileNotFoundException ex) {
            Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex);
        }

        finally {
            // Release the file handle once reading is finished or has failed.
            if (bReader != null) {
                bReader.close();
            }
        }

        break;

    case "html":
        // https://scholar.google.com.tr/scholar?hl=tr&q=ecir+u%C4%9Fur+k%C3%BC%C3%A7%C3%BCksille&btnG=&lr=
        // url is always empty here, so only the local-file branch is ever taken.
        String url = "";
        try {
            if (url.isEmpty()) {

                Document doc = Jsoup.parse(file, null);
                fileContent = doc.text();
                getParsedData.add(fileContent);

            } else {
                Document doc = Jsoup.connect(url).get();
                Elements elements = doc.select("div.gs_r");
                for (Element div : elements) {
                    fileContent += div.text();
                }

                getParsedData.add(fileContent);

            }

        } catch (Exception e) {
            e.printStackTrace();
        }

        break;

    case "pdf":
        // NOTE(review): inputStream and pdDoc are fields and are not closed
        // here; confirm that they are released elsewhere.
        try {
            inputStream = new FileInputStream(file);
            parser = new PDFParser(inputStream);
            parser.parse();
            cosDoc = parser.getDocument();
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(2);
            fileContent = pdfStripper.getText(pdDoc);
            getParsedData.add(fileContent);

        }

        catch (Exception e) {
            e.printStackTrace();
        }

        break;

    case "doc":
        // try-with-resources closes the stream even when extraction fails.
        try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {

            HWPFDocument document = new HWPFDocument(fis);
            WordExtractor extractor = new WordExtractor(document);
            fileContent = extractor.getText();
            getParsedData.add(fileContent);

        } catch (Exception e) {
            e.printStackTrace();
        }

        break;

    case "xml":
        /*
         * Example input: /home/burakcan/Desktop/eurofxref.xml
         */

        // NOTE(review): the parser runs with default settings; if the XML file
        // can come from an untrusted source, disable DTDs and external entities.
        try {
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            Document doc = dBuilder.parse(file);
            doc.getDocumentElement().normalize();

            // Every Cube element contributes one "<currency> <rate>" entry.
            NodeList nList = doc.getElementsByTagName("Cube");
            for (int i = 0; i < nList.getLength(); i++) {
                Node nNode = nList.item(i);
                Element eElement = (Element) nNode;
                getParsedData.add(eElement.getAttribute("currency") + " " + eElement.getAttribute("rate"));

            }

        }

        catch (ParserConfigurationException ex) {
            Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex);
        }

        break;

    default:

        JOptionPane.showMessageDialog(null, "This program can not parse your choice!", "Program Error",
                JOptionPane.ERROR_MESSAGE);
        System.exit(0);
    }

    return getParsedData;

}

From source file:graph.readfile.ReadFileDOC.java

/**
 * @param path menyimpan destination file tersebut berada
 * @return array string/*  w w w .  j  av  a 2  s . c  o  m*/
 * @throws java.io.IOException
 */
protected String GetDataFromFile(String path) throws IOException {
    file = new File(path);
    fis = new FileInputStream(file);
    document = new HWPFDocument(fis);
    extractor = new WordExtractor(document);
    String fileData = extractor.getText();

    return fileData;
}

From source file:graph.readfile.ReadFileDOC.java

/**
 * Reads a Word document and returns its text split into paragraphs, one array
 * element per paragraph.
 * <p>
 * The {@code file}, {@code fis}, {@code document} and {@code extractor} fields
 * are reassigned on every call.
 *
 * @param path location of the file to read
 * @return the paragraph texts of the document
 * @throws java.io.IOException if the file cannot be opened or read
 */
protected String[] GetDataFromFilePerBaris(String path) throws IOException {
    file = new File(path);
    fis = new FileInputStream(file);
    try {
        document = new HWPFDocument(fis);
        extractor = new WordExtractor(document);
        return extractor.getParagraphText();
    } finally {
        // The stream is no longer needed once the document has been loaded.
        fis.close();
    }
}

From source file:insight.masters.policyanalytics.services.BranchingOriginStanfordKeywords.java

/**
 * Reads the Word document at {@code datsetspath + Document}, strips all
 * {@code \n} and {@code \r} characters from each paragraph, echoes the result
 * to standard output and returns the concatenation.
 * <p>
 * Failures are caught and their stack trace is printed; the text collected up
 * to that point (possibly empty) is still returned.
 *
 * @param datsetspath directory prefix, concatenated as-is with {@code Document}
 * @param Document    file name of the Word document
 * @return the concatenated paragraph text without line terminators
 */
public static String readfromdoc(String datsetspath, String Document) {
    StringBuilder extractedtext = new StringBuilder();
    File file = new File(datsetspath + Document);
    // try-with-resources closes the stream even when extraction fails.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        WordExtractor extractor = new WordExtractor(new HWPFDocument(fis));
        for (String paragraph : extractor.getParagraphText()) {
            // The null check now guards both the echo and the append.
            if (paragraph == null) {
                continue;
            }
            String flattened = paragraph.replace("\n", "").replace("\r", "");
            System.out.print(flattened);
            extractedtext.append(flattened);
        }
    } catch (Exception exep) {
        exep.printStackTrace();
    }
    return extractedtext.toString();
}

From source file:insight.masters.policyanalytics.services.PolicyAnalytics.java

/**
 * Reads the Word document at {@code datsetspath + Document}, strips all
 * {@code \n} and {@code \r} characters from each paragraph, echoes the result
 * to standard output and returns the concatenation.
 * <p>
 * Failures are caught and their stack trace is printed; the text collected up
 * to that point (possibly empty) is still returned.
 *
 * @param datsetspath directory prefix, concatenated as-is with {@code Document}
 * @param Document    file name of the Word document
 * @return the concatenated paragraph text without line terminators
 */
public static String readfromdoc(String datsetspath, String Document) {
    StringBuilder extractedtext = new StringBuilder();
    File file = new File(datsetspath + Document);
    // try-with-resources closes the stream even when extraction fails.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        WordExtractor extractor = new WordExtractor(new HWPFDocument(fis));
        for (String paragraph : extractor.getParagraphText()) {
            // The null check now guards both the echo and the append.
            if (paragraph == null) {
                continue;
            }
            String flattened = paragraph.replace("\n", "").replace("\r", "");
            System.out.print(flattened);
            extractedtext.append(flattened);
        }
    } catch (Exception exep) {
        exep.printStackTrace();
    }
    return extractedtext.toString();
}

From source file:intelligentWebAlgorithms.util.parsing.msword.MSWordDocumentParser.java

License:Apache License

/**
 * Loads the Word document stored in {@code fileName} and copies its text into
 * {@code wordDoc} as title (derived via {@code getTitle}), text and content.
 * <p>
 * Failures are caught and their stack trace is printed instead of being
 * propagated.
 *
 * @param fileName path of the Word file to load
 * @return the loaded document, or {@code null} if it could not be loaded
 */
public HWPFDocument poiReadDocument(String fileName) {

    HWPFDocument hwpfDoc = null;
    // try-with-resources closes the file stream even when loading fails.
    try (FileInputStream in = new FileInputStream(fileName)) {
        POIFSFileSystem fs = new POIFSFileSystem(in);
        hwpfDoc = new HWPFDocument(fs);

        // Read the content
        String text = hwpfDoc.getDocumentText();

        wordDoc.setDocumentTitle(getTitle(text));
        wordDoc.setText(text);
        wordDoc.setContent(text);

    } catch (Exception e) {
        e.printStackTrace();
    }
    return hwpfDoc;
}