List of usage examples for the org.apache.poi.hwpf.HWPFDocument constructor
public HWPFDocument(DirectoryNode directory) throws IOException
From source file:edu.ur.ir.index.DefaultWordTextExtractor.java
License:Apache License
/**
 * Extracts the text of a Word 97-2003 document.
 *
 * @param f the Word file to read
 * @return the extracted text, or {@code null} when the file is rejected by
 *         {@code isFileTooLarge}, has a length of zero or less, or yields
 *         only whitespace
 * @throws Exception any failure raised while reading the document; it is
 *         logged and then rethrown unchanged
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    // Oversized and empty files are skipped without being opened.
    if (isFileTooLarge(f) || f.length() <= 0L) {
        return null;
    }

    FileInputStream inputStream = null;
    try {
        inputStream = new FileInputStream(f);
        WordExtractor extractor = new WordExtractor(new HWPFDocument(inputStream));
        String extracted = extractor.getText();
        boolean hasContent = extracted != null && !extracted.trim().equals("");
        return hasContent ? extracted : null;
    } catch (OutOfMemoryError oome) {
        // Logged for diagnosis, but never swallowed.
        log.error("could not extract text", oome);
        throw oome;
    } catch (Exception e) {
        log.error("could not get text for word document " + f.getAbsolutePath(), e);
        throw e;
    } finally {
        // Runs on every path once the stream variable exists, including the returns above.
        closeInputStream(inputStream);
    }
}
From source file:eu.modelwriter.ide.ui.command.ExtractTextHandler.java
License:Open Source License
/** * Extracts text from the given .doc {@link IFile}. * /* ww w . j av a 2 s .c o m*/ * @param file * the .doc {@link IFile} */ private void exctractDoc(final IFile file) { try { FileInputStream fis = new FileInputStream(file.getLocation().toFile()); HWPFDocument doc = new HWPFDocument(fis); WordExtractor we = new WordExtractor(doc); final IPath textPath = file.getFullPath().removeFileExtension().addFileExtension("txt"); final IFile textFile = ResourcesPlugin.getWorkspace().getRoot().getFile(textPath); if (textFile.exists()) { textFile.delete(true, new NullProgressMonitor()); } textFile.create(new ByteArrayInputStream(we.getText().getBytes()), true, new NullProgressMonitor()); we.close(); fis.close(); } catch (IOException e) { Activator.getDefault().getLog().log(new Status(IStatus.ERROR, Activator.PLUGIN_ID, UNABLE_TO_EXTRACT_TEXT_FROM + file.getFullPath(), e)); } catch (CoreException e) { Activator.getDefault().getLog().log(new Status(IStatus.ERROR, Activator.PLUGIN_ID, UNABLE_TO_EXTRACT_TEXT_FROM + file.getFullPath(), e)); } }
From source file:File.DOC.ReadDoc.java
public void Read(String path, String namafile) { try {// w w w . jav a 2 s . c o m File file = new File(path + namafile + ".doc"); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument doc = new HWPFDocument(fis); WordExtractor we = new WordExtractor(doc); String[] paragraphs = we.getParagraphText(); System.out.println("Total no of paragraph " + paragraphs.length); for (String para : paragraphs) { System.out.println(para.toString()); } fis.close(); } catch (Exception ex) { ex.printStackTrace(); } }
From source file:File.DOC.WriteDoc.java
/** * @param args the command line arguments *///w w w.j a va2 s. c om public void Write(String path, String namafile, String content) { File file = new File("D:\\xyz.doc"); try { POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(file)); HWPFDocument doc = new HWPFDocument(fs); Range range = doc.getRange(); CharacterRun run = range.insertBefore(content.replace("\n", "\013")); run.setBold(true); OutputStream outa = new FileOutputStream(new File(path + namafile + ".doc")); doc.write(outa); out.close(); } catch (Exception e) { System.out.println(e.getMessage()); } }
From source file:FileParser.Parser.java
public ArrayList<String> fileParser() throws IOException, SAXException { String path = file.getPath(); String[] getArray = fileChooser.getFileType(); String type = getArray[1];// w w w. j a va 2 s . c o m //System.out.println("Type: "+type); String fileName = getArray[0]; String fileContent = ""; file = fileChooser.getFile(); getParsedData = new ArrayList<>(); switch (type) { case "txt": try { FileReader contentReader = new FileReader(file.getPath()); bReader = new BufferedReader(contentReader); while ((fileContent = bReader.readLine()) != null) { getParsedData.add(fileContent); } } catch (FileNotFoundException ex) { Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex); } break; case "html": // https://scholar.google.com.tr/scholar?hl=tr&q=ecir+u%C4%9Fur+k%C3%BC%C3%A7%C3%BCksille&btnG=&lr= String url = ""; try { if (url.isEmpty()) { Document doc = Jsoup.parse(file, null); fileContent = doc.text(); getParsedData.add(fileContent); } else { Document doc = Jsoup.connect(url).get(); Elements elements = doc.select("div.gs_r"); for (Element div : elements) { fileContent += div.text(); } getParsedData.add(fileContent); } } catch (Exception e) { e.printStackTrace(); } break; case "pdf": try { inputStream = new FileInputStream(file); parser = new PDFParser(inputStream); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(2); fileContent = pdfStripper.getText(pdDoc); getParsedData.add(fileContent); } catch (Exception e) { e.printStackTrace(); } break; case "doc": try { FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument document = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(document); fileContent = extractor.getText(); getParsedData.add(fileContent); } catch (Exception e) { e.printStackTrace(); } break; case "xml": /* parsing xml file path /home/burakcan/Desktop/eurofxref.xml */ try { 
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(file); doc.getDocumentElement().normalize(); Element firstCube = (Element) doc.getElementsByTagName("Cube").item(0); Element secondCube = (Element) firstCube.getElementsByTagName("Cube").item(0); NodeList nList = doc.getElementsByTagName("Cube"); for (int i = 0; i < nList.getLength(); i++) { Node nNode = nList.item(i); Element eElement = (Element) nNode; getParsedData.add(eElement.getAttribute("currency") + " " + eElement.getAttribute("rate")); } } catch (ParserConfigurationException ex) { Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex); } break; default: JOptionPane.showMessageDialog(null, "This program can not parse your choice!", "Program Error", JOptionPane.ERROR_MESSAGE); System.exit(0); } return getParsedData; }
From source file:graph.readfile.ReadFileDOC.java
/** * @param path menyimpan destination file tersebut berada * @return array string/* w w w . j av a 2 s . c o m*/ * @throws java.io.IOException */ protected String GetDataFromFile(String path) throws IOException { file = new File(path); fis = new FileInputStream(file); document = new HWPFDocument(fis); extractor = new WordExtractor(document); String fileData = extractor.getText(); return fileData; }
From source file:graph.readfile.ReadFileDOC.java
/**
 * Reads the Word document at the given location and returns its text split
 * into the pieces reported by the extractor's paragraph listing, one array
 * element per piece. The opened file, stream, document and extractor are kept
 * in the corresponding fields of this object.
 *
 * @param path location of the file to read
 * @return an array holding the document text, one entry per paragraph
 * @throws java.io.IOException if the file cannot be opened or read
 */
protected String[] GetDataFromFilePerBaris(String path) throws IOException {
    this.file = new File(path);
    this.fis = new FileInputStream(this.file);
    this.document = new HWPFDocument(this.fis);
    this.extractor = new WordExtractor(this.document);
    return this.extractor.getParagraphText();
}
From source file:insight.masters.policyanalytics.services.BranchingOriginStanfordKeywords.java
public static String readfromdoc(String datsetspath, String Document) { File file = null;/* w w w .j av a 2s. com*/ WordExtractor extractor = null; String extractedtext = ""; try { file = new File(datsetspath + Document); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument document = new HWPFDocument(fis); extractor = new WordExtractor(document); String[] fileData = extractor.getParagraphText(); for (int i = 0; i < fileData.length; i++) { if (fileData[i] != null) // System.out.print("{\"text\":\""); System.out.print(fileData[i].replace("\n", "").replace("\r", "")); extractedtext += fileData[i].replace("\n", "").replace("\r", ""); // System.out.print("\"}"); } } catch (Exception exep) { exep.printStackTrace(); } return extractedtext; }
From source file:insight.masters.policyanalytics.services.PolicyAnalytics.java
public static String readfromdoc(String datsetspath, String Document) { File file = null;//from w w w . j a v a2s.c om WordExtractor extractor = null; String extractedtext = ""; try { file = new File(datsetspath + Document); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument document = new HWPFDocument(fis); extractor = new WordExtractor(document); String[] fileData = extractor.getParagraphText(); for (int i = 0; i < fileData.length; i++) { if (fileData[i] != null) // System.out.print("{\"text\":\""); System.out.print(fileData[i].replace("\n", "").replace("\r", "")); extractedtext += fileData[i].replace("\n", "").replace("\r", ""); // System.out.print("\"}"); } } catch (Exception exep) { exep.printStackTrace(); } return extractedtext; }
From source file:intelligentWebAlgorithms.util.parsing.msword.MSWordDocumentParser.java
License:Apache License
public HWPFDocument poiReadDocument(String fileName) { POIFSFileSystem fs = null;/*from w w w. j a v a 2 s . c om*/ HWPFDocument hwpfDoc = null; try { fs = new POIFSFileSystem(new FileInputStream(fileName)); hwpfDoc = new HWPFDocument(fs); /** Read the content **/ String text = hwpfDoc.getDocumentText(); wordDoc.setDocumentTitle(getTitle(text)); wordDoc.setText(text); wordDoc.setContent(text); // P.hline(); // P.println(getTitle(text)); // printProperties(hwpfDoc.getDocProperties()); // readParagraphs(hwpfDoc); } catch (Exception e) { e.printStackTrace(); } return hwpfDoc; }