List of usage examples for the org.apache.poi.hwpf.HWPFDocument constructor
public HWPFDocument(DirectoryNode directory) throws IOException
From source file:javaapplication1.HWPFTest.java
public static void doStuff() { String filePath = "D:\\insiders_report4.doc"; POIFSFileSystem fs = null;// w w w .j a va 2 s . c o m try { fs = new POIFSFileSystem(new FileInputStream(filePath)); HWPFDocument doc = new HWPFDocument(fs); doc = replaceText(doc, "#FIO#", " ? ?"); doc = replaceText(doc, "#BIN#", "900524300077"); doc = replaceText(doc, "#INCDAY#", "05"); doc = replaceText(doc, "#INCMONTH#", ""); doc = replaceText(doc, "#INCYEAR#", "2016"); doc = replaceText(doc, "#EXCDAY#", "05"); doc = replaceText(doc, "#EXCMONTH#", ""); doc = replaceText(doc, "#EXCYEAR#", "2016"); doc = replaceText(doc, "#MAINCHIEF#", " ?"); saveWord("D:\\result.doc", doc); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } }
From source file:javaapplication1.utils.MyWordToHtml.java
public static void convert(String path, String file) throws Throwable { InputStream input = new FileInputStream(path + file); HWPFDocument wordDocument = new HWPFDocument(input); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); MyPictureManager pictureManager = new MyPictureManager(); wordToHtmlConverter.setPicturesManager(pictureManager); wordToHtmlConverter.processDocument(wordDocument); List<?> pics = wordDocument.getPicturesTable().getAllPictures(); File dir = new File("D:\\pics"); dir.mkdir();//from w w w . j av a 2s .com if (pics != null) { for (int i = 0; i < pics.size(); i++) { Picture pic = (Picture) pics.get(i); try { pic.writeImageContent(new FileOutputStream(path + "pics/" + pic.suggestFullFileName())); } catch (FileNotFoundException e) { e.printStackTrace(); } } } Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream outStream = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(outStream); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); outStream.close(); String content = new String(outStream.toByteArray()); writeFile(content, path + "result.html", "UTF-8"); }
From source file:mc.program.Importer.java
public void importDOC() { try {//from w ww . j a va2 s. c o m // Set up objects for getting from .doc file FileInputStream fis = new FileInputStream(sourceFile.getAbsolutePath()); HWPFDocument document = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(document); // Extract text String[] fileData = extractor.getParagraphText(); // Put text into array list for (String fileData1 : fileData) { Scanner scanner = new Scanner(fileData1); while (scanner.hasNext()) { sourceText.add(scanner.next()); } } fis.close(); extractor.close(); } catch (Exception ex) { System.out.print(ex); } }
From source file:me.philnate.textmanager.utils.WordCount.java
License:Open Source License
/** * opens the given file, if it's a .doc or .docx file and returns the number * of words within the document/* w w w . jav a2 s . c o m*/ * * @param file * @return * @throws FileNotFoundException * @throws IOException */ public static long countFile(File file) throws FileNotFoundException, IOException { try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) { if (file.getName().endsWith(".docx")) { XWPFDocument document = new XWPFDocument(fis); XWPFWordExtractor extractor = new XWPFWordExtractor(document); return linecount(extractor.getText()); } else if (file.getName().endsWith(".doc")) { HWPFDocument document = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(document); return WordCount.linecount(extractor.getText()); } else { throw new IllegalArgumentException("Can't handle non doc(X) files"); } } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { HWPFDocument document;// www. j a v a 2s . com try { document = new HWPFDocument(root); } catch (OldWordFileFormatException e) { parseWord6(root, xhtml); return; } org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor( document); // mj extractImageText(xhtml, document); HeaderStories headerFooter = new HeaderStories(document); // Grab the list of pictures. As far as we can tell, // the pictures should be in order, and may be directly // placed or referenced from an anchor PicturesTable pictureTable = document.getPicturesTable(); PicturesSource pictures = new PicturesSource(document); // Do any headers, if present Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() }; handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); // Do the main paragraph text Range r = document.getRange(); for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml); } // Do everything else for (String paragraph : wordExtractor.getMainTextboxText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getCommentsText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getEndnoteText()) { xhtml.element("p", paragraph); } // Do any footers, if present Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() }; handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); // Handle any pictures that we haven't output yet for (Picture p = 
pictures.nextUnclaimed(); p != null;) { handlePictureCharacterRun(null, p, pictures, xhtml); p = pictures.nextUnclaimed(); } // Handle any embeded office documents try { DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); for (Entry entry : op) { if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } } } catch (FileNotFoundException e) { } }
From source file:Modelo.EscribirWord.java
/**
 * Creates {@code HojaInventarioObjeto.doc} from the template
 * {@code HojaInventarioTemplate.doc} by replacing each {@code $placeholder}
 * with the corresponding value of {@code obj}.
 *
 * <p>Substitution is done with {@code replaceText} and the result is written
 * with {@code saveWord}. I/O failures (a missing template included) are printed
 * to standard error and not rethrown, as before.
 *
 * @param obj source of the values inserted into the template
 * @throws IOException declared for compatibility with existing callers
 */
public void crearWord(Objeto obj) throws IOException {
    String filePath = "HojaInventarioTemplate.doc";
    String filePathFinal = "HojaInventarioObjeto.doc";
    // try-with-resources releases the template stream on every path instead of
    // depending on the library to close it.
    try (FileInputStream in = new FileInputStream(filePath)) {
        HWPFDocument doc = new HWPFDocument(new POIFSFileSystem(in));
        doc = replaceText(doc, "$nombreObjeto", obj.getNombreObjeto());
        doc = replaceText(doc, "$formaAdquisicion", obj.getFormaAdquisicion());
        doc = replaceText(doc, "$fechaIngreso", obj.getFechaIngreso());
        doc = replaceText(doc, "$numRegistro", obj.getNumRegistro());
        doc = replaceText(doc, "$valorEconomico", obj.getValorEconomico());
        doc = replaceText(doc, "$nombreFuente", obj.getNombreFuente());
        doc = replaceText(doc, "$fechaInventario", obj.getFechaInventario());
        doc = replaceText(doc, "$numCatalogo", obj.getNumCatalogo());
        doc = replaceText(doc, "$numInventario", obj.getNumInventario());
        doc = replaceText(doc, "$otrosNumeros", obj.getOtrosNumeros());
        doc = replaceText(doc, "$direccionFuente", obj.getDireccionFuente());
        doc = replaceText(doc, "$fechaCatalogo", obj.getFechaCatalogo());
        doc = replaceText(doc, "$espesor", obj.getEspesor());
        doc = replaceText(doc, "$alto", obj.getAlto());
        doc = replaceText(doc, "$ancho", obj.getAncho());
        doc = replaceText(doc, "$largo", obj.getLargo());
        doc = replaceText(doc, "$diametro", obj.getDiametro());
        doc = replaceText(doc, "$peso", obj.getPeso());
        doc = replaceText(doc, "$procedencia", obj.getProcedencia());
        doc = replaceText(doc, "$materiaYTecnica", obj.getMateriaYTecnica());
        doc = replaceText(doc, "$numeroNegativo", obj.getNumeroNegativo());
        doc = replaceText(doc, "$autor", obj.getAutor());
        doc = replaceText(doc, "$epoca", obj.getEpoca());
        doc = replaceText(doc, "$descripcion", obj.getDescripcion());
        doc = replaceText(doc, "$documentacion", obj.getDocumentacion());
        doc = replaceText(doc, "$observaciones", obj.getObservaciones());
        doc = replaceText(doc, "$recibio", obj.getRecibio());
        doc = replaceText(doc, "$inventario", obj.getInventario());
        doc = replaceText(doc, "$catalogo", obj.getCatalogo());
        doc = replaceText(doc, "$aprobo", obj.getAprobo());
        saveWord(filePathFinal, doc);
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so one handler covers both cases.
        e.printStackTrace();
    }
}
From source file:no.trank.openpipe.parse.ms.WordParser.java
License:Apache License
@Override public ParserResult parse(ParseData data) throws IOException, ParserException { final HWPFDocument doc = new HWPFDocument(data.getInputStream()); final ParserResultImpl result = new ParserResultImpl(); result.setTitle(doc.getSummaryInformation().getTitle()); final WordExtractor extractor = new WordExtractor(doc); result.setText(POIUtils.getCleanText(extractor.getText())); if (data.includeProperties()) { result.setProperties(POIUtils.getProperties(doc)); }//from w ww. j a v a 2 s . c om return result; }
From source file:orcamentotraducao.OrcamentoTraducao.java
/** * @param args the command line arguments *//*from ww w . j a va 2s .c om*/ public static void main(String[] args) { // TODO code application logic here Scanner scan = new Scanner(System.in); System.out.println("Informe o nome do arquivo:"); String filename = scan.nextLine(); String typeFile = filename.substring(filename.length() - 3, filename.length()); if (!typeFile.matches("ocx") && !typeFile.matches("doc")) { System.out.println("Este formato de arquivo no suportado\n"); System.exit(0); } try { File file = new File(filename); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); String allText = ""; int lines = 0; if (typeFile.matches("ocx")) { XWPFDocument document = new XWPFDocument(fis); List<XWPFParagraph> paragraphs = document.getParagraphs(); for (XWPFParagraph para : paragraphs) { allText += para.getText() + " "; lines++; } fis.close(); } else if (typeFile.matches("doc")) { WordExtractor extractor = new WordExtractor(new HWPFDocument(fis)); allText = extractor.getText(); } String allTextExploded[] = allText.split(" "); int words = allTextExploded.length; int characters = allText.length(); System.out.println("H " + words + " palavras"); System.out.println("H " + characters + " caracteres"); System.out.println("H " + lines + " linhas"); System.out.println("O oramento estimado de R$" + calculate(characters, words, lines)); } catch (Exception e) { e.printStackTrace(); } }
From source file:org.apache.tika.parser.microsoft.WordExtractor.java
License:Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { HWPFDocument document;/* w w w . j a v a 2 s . com*/ try { document = new HWPFDocument(root); } catch (OldWordFileFormatException e) { parseWord6(root, xhtml); return; } org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor( document); HeaderStories headerFooter = new HeaderStories(document); // Grab the list of pictures. As far as we can tell, // the pictures should be in order, and may be directly // placed or referenced from an anchor PicturesTable pictureTable = document.getPicturesTable(); PicturesSource pictures = new PicturesSource(document); // Do any headers, if present Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() }; handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); // Do the main paragraph text Range r = document.getRange(); ListManager listManager = new ListManager(document); for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml); } // Do everything else for (String paragraph : wordExtractor.getMainTextboxText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getCommentsText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getEndnoteText()) { xhtml.element("p", paragraph); } // Do any footers, if present Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() }; handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); // Handle any pictures that we haven't output yet for 
(Picture p = pictures.nextUnclaimed(); p != null;) { handlePictureCharacterRun(null, p, pictures, xhtml); p = pictures.nextUnclaimed(); } // Handle any embeded office documents try { DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); for (Entry entry : op) { if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } } } catch (FileNotFoundException e) { } }
From source file:org.docx4j.convert.in.Doc.java
License:Apache License
/**
 * Reads a binary Word document from the stream and converts it into a newly
 * created {@code WordprocessingMLPackage}.
 *
 * @param in stream supplying the doc file
 * @return new WordprocessingMLPackage containing the results of the conversion
 * @throws Exception if reading the document or converting it fails
 */
public static WordprocessingMLPackage convert(InputStream in) throws Exception {
    final HWPFDocument source = new HWPFDocument(in);
    final WordprocessingMLPackage target = WordprocessingMLPackage.createPackage();
    // Delegate to the overload that copies the content into the package.
    convert(source, target);
    return target;
}