List of usage examples for the org.apache.poi.hwpf.HWPFDocument constructor
public HWPFDocument(DirectoryNode directory) throws IOException
From source file:org.opf_labs.aqua.OfficeAnalyser.java
License:Apache License
public static void main(String[] args) throws Exception { //import org.apache.poi.poifs.dev.POIFSDump; //POIFSDump.main(args); SMOutputDocument xmldoc = SMOutputFactory.createOutputDocument( SMOutputFactory.getGlobalXMLOutputFactory().createXMLStreamWriter(System.out, "UTF-8"), "1.1", "UTF-8", true); xmldoc.setIndentation("\n ", 1, 2); // for unix linefeed, 2 spaces per level SMOutputElement xmlroot = xmldoc.addElement("properties"); // Loop through arguments: for (int i = 0; i < args.length; i++) { SMOutputElement xd = xmlroot.addElement("document"); xd.addAttribute("href", args[i]); HWPFDocument doc = new HWPFDocument(new FileInputStream(args[i])); // SummaryInformation SMOutputElement sie = xd.addElement("SummaryInformation"); sie.addElement("ApplicationName").addCharacters(doc.getSummaryInformation().getApplicationName()); sie.addElement("OSVersion").addCharacters("" + doc.getSummaryInformation().getOSVersion()); sie.addElement("Author").addCharacters("" + doc.getSummaryInformation().getAuthor()); sie.addElement("CharCount").addCharacters("" + doc.getSummaryInformation().getCharCount()); sie.addElement("Comments").addCharacters("" + doc.getSummaryInformation().getComments()); sie.addElement("EditTime").addCharacters("" + doc.getSummaryInformation().getEditTime()); sie.addElement("Format").addCharacters("" + doc.getSummaryInformation().getFormat()); sie.addElement("Keywords").addCharacters("" + doc.getSummaryInformation().getKeywords()); sie.addElement("LastAuthor").addCharacters("" + doc.getSummaryInformation().getLastAuthor()); sie.addElement("PageCount").addCharacters("" + doc.getSummaryInformation().getPageCount()); sie.addElement("RevNumber").addCharacters("" + doc.getSummaryInformation().getRevNumber()); sie.addElement("SectionCount").addCharacters("" + doc.getSummaryInformation().getSectionCount()); sie.addElement("Security").addCharacters("" + doc.getSummaryInformation().getSecurity()); sie.addElement("Subject").addCharacters("" + 
doc.getSummaryInformation().getSubject()); sie.addElement("Template").addCharacters("" + doc.getSummaryInformation().getTemplate()); sie.addElement("Title").addCharacters("" + doc.getSummaryInformation().getTitle()); sie.addElement("WordCount").addCharacters("" + doc.getSummaryInformation().getWordCount()); sie.addElement("CreatedDateTime").addCharacters("" + doc.getSummaryInformation().getCreateDateTime()); sie.addElement("LastPrinted").addCharacters("" + doc.getSummaryInformation().getLastPrinted()); sie.addElement("LastSaveDateTime") .addCharacters("" + doc.getSummaryInformation().getLastSaveDateTime()); sie.addElement("Thumbnail").addCharacters("" + doc.getSummaryInformation().getThumbnail()); // TextTable SMOutputElement tte = xd.addElement("TextTable"); for (TextPiece tp : doc.getTextTable().getTextPieces()) { SMOutputElement tpe = tte.addElement("TextPiece"); tpe.addAttribute("isUnicode", "" + tp.getPieceDescriptor().isUnicode()); tpe.addCharacters(tp.getStringBuilder().toString()); }// w w w. 
j ava 2 s .co m // DocumentSummaryInformation SMOutputElement dsie = xd.addElement("DocumentSummaryInformation"); dsie.addElement("ParCount").addCharacters("" + doc.getDocumentSummaryInformation().getParCount()); dsie.addElement("ByteCount").addCharacters("" + doc.getDocumentSummaryInformation().getByteCount()); dsie.addElement("HiddenCount").addCharacters("" + doc.getDocumentSummaryInformation().getHiddenCount()); dsie.addElement("LineCount").addCharacters("" + doc.getDocumentSummaryInformation().getLineCount()); dsie.addElement("MMClipCount").addCharacters("" + doc.getDocumentSummaryInformation().getMMClipCount()); dsie.addElement("NoteCount").addCharacters("" + doc.getDocumentSummaryInformation().getNoteCount()); dsie.addElement("SectionCount") .addCharacters("" + doc.getDocumentSummaryInformation().getSectionCount()); dsie.addElement("SlideCount").addCharacters("" + doc.getDocumentSummaryInformation().getSlideCount()); dsie.addElement("Format").addCharacters("" + doc.getDocumentSummaryInformation().getFormat()); dsie.addElement("PresentationFormat") .addCharacters("" + doc.getDocumentSummaryInformation().getPresentationFormat()); dsie.addElement("Company").addCharacters("" + doc.getDocumentSummaryInformation().getCompany()); dsie.addElement("Category").addCharacters("" + doc.getDocumentSummaryInformation().getCategory()); // Sections for (Object os : doc.getDocumentSummaryInformation().getSections()) { Section s = (Section) os; SMOutputElement se = dsie.addElement("Section"); se.addElement("FormatID").addCharacters("" + s.getFormatID()); se.addElement("CodePage").addCharacters("" + s.getCodepage()); se.addElement("PropertyCount").addCharacters("" + s.getPropertyCount()); for (Property sp : s.getProperties()) { SMOutputElement pe = se.addElement("Property"); pe.addAttribute("class", sp.getValue().getClass().getCanonicalName()); pe.addCharacters(sp.getValue().toString()); } } SMOutputElement fte = xd.addElement("FontTable"); for (Ffn f : 
doc.getFontTable().getFontNames()) { SMOutputElement fe = fte.addElement("Font"); fe.addElement("MainFontName").addCharacters(f.getMainFontName()); try { fe.addElement("AltFontName").addCharacters(f.getAltFontName()); } catch (Exception e) { // Seems to fail, and no safe test found as yet. } fe.addElement("Size").addCharacters("" + f.getSize()); fe.addElement("Weight").addCharacters("" + f.getWeight()); } SMOutputElement pte = xd.addElement("PicturesTable"); for (Picture p : doc.getPicturesTable().getAllPictures()) { SMOutputElement pe = pte.addElement("Picture"); pe.addElement("MimeType").addCharacters(p.getMimeType()); pe.addElement("Width").addCharacters("" + p.getWidth()); pe.addElement("Height").addCharacters("" + p.getHeight()); pe.addElement("HorizontalScalingFactor").addCharacters("" + p.getHorizontalScalingFactor()); pe.addElement("VerticalScalingFactor").addCharacters("" + p.getVerticalScalingFactor()); pe.addElement("Content").addCharacters("" + p.getContent()); } //parseCompObj( new File(args[i]) ); // This //System.out.println("Dumping " + args[i]); FileInputStream is = new FileInputStream(args[i]); POIFSFileSystem fs = new POIFSFileSystem(is); is.close(); DirectoryEntry root = fs.getRoot(); //dump(root); xmldoc.closeRoot(); // important, flushes, closes output } }
From source file:org.paxle.parser.msoffice.impl.MsWordParser.java
License:Open Source License
@Override protected void extractText(POIFSFileSystem fs, IParserDocument parserDoc) throws ParserException, IOException { // extract plain text final HWPFDocument doc = new HWPFDocument(fs); final Range r = doc.getRange(); for (int i = 0; i < r.numParagraphs(); i++) { // get next paragraph final Paragraph p = r.getParagraph(i); // append paragraph text parserDoc.append(p.text());/* w w w .j ava2s . c om*/ // we know that this is the end of a block of text, so we can include a separator parserDoc.append(' '); } }
From source file:org.sakaiproject.contentreview.impl.compilatio.CompilatioContentValidator.java
License:Educational Community License
private int wordDocLength(ContentResource resource) { if (!serverConfigurationService.getBoolean("tii.checkWordLength", false)) return 100; try {/*from w ww.j a v a2 s . c o m*/ POIFSFileSystem pfs = new POIFSFileSystem(resource.streamContent()); HWPFDocument doc = new HWPFDocument(pfs); SummaryInformation dsi = doc.getSummaryInformation(); int count = dsi.getWordCount(); log.debug("got a count of " + count); //if this == 0 then its likely that something went wrong -poi couldn't read it if (count == 0) return 100; return count; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ServerOverloadException e) { // TODO Auto-generated catch block e.printStackTrace(); } //in case we can't read this lets err on the side of caution return 100; }
From source file:org.shareok.data.documentProcessor.WordHandler.java
/**
 * Reads the paragraphs of a Word document from the given stream.
 *
 * <p>The stream is always closed before this method returns. A failure while
 * parsing is printed to standard error and reported as a {@code null} result.
 *
 * @param fs the stream to read the Word document from
 * @return the text of each paragraph, or {@code null} if the document could not be parsed
 * @throws IOException if closing the stream fails
 */
private String[] readDocFile(FileInputStream fs) throws IOException {
    try {
        return new WordExtractor(new HWPFDocument(fs)).getParagraphText();
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    } finally {
        fs.close();
    }
}
From source file:org.sleuthkit.autopsy.imageExtractor.ImageExtractor.java
private List<ExtractedImage> extractImagesFromDoc(AbstractFile af) { // TODO check for BBArtifact ENCRYPTION_DETECTED? Might be detected elsewhere...? List<ExtractedImage> listOfExtractedImages = new ArrayList<ExtractedImage>(); String parentFileName = getUniqueName(af); HWPFDocument docA = null;// w w w . j av a 2 s. c o m try { docA = new HWPFDocument(new ReadContentInputStream(af)); } catch (IOException ex) { logger.log(Level.WARNING, "HWPFDocument container could not be instantiated while reading " + af.getName(), ex); return null; } PicturesTable pictureTable = docA.getPicturesTable(); List<org.apache.poi.hwpf.usermodel.Picture> listOfAllPictures = pictureTable.getAllPictures(); String outputFolderPath; if (listOfAllPictures.isEmpty()) { return null; } else { outputFolderPath = getOutputFolderPath(parentFileName); } if (outputFolderPath == null) { logger.log(Level.WARNING, "Could not get path for image extraction from AbstractFile: {0}", af.getName()); return null; } for (org.apache.poi.hwpf.usermodel.Picture picture : listOfAllPictures) { FileOutputStream fos = null; String fileName = picture.suggestFullFileName(); try { fos = new FileOutputStream(outputFolderPath + File.separator + fileName); } catch (FileNotFoundException ex) { logger.log(Level.WARNING, "Invalid path provided for image extraction", ex); continue; } try { fos.write(picture.getContent()); fos.close(); } catch (IOException ex) { logger.log(Level.WARNING, "Could not write to the provided location", ex); continue; } // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime String fileRelativePath = File.separator + moduleDirRelative + File.separator + parentFileName + File.separator + fileName; long size = picture.getSize(); ExtractedImage extractedimage = new ExtractedImage(fileName, fileRelativePath, size, af); listOfExtractedImages.add(extractedimage); } return listOfExtractedImages; }
From source file:org.sleuthkit.autopsy.modules.embeddedfileextractor.ImageExtractor.java
License:Open Source License
/** * Extract images from doc format files. * * @param af the file from which images are to be extracted. * * @return list of extracted images. Returns null in case no images were * extracted./*from w w w .j ava 2 s. c o m*/ */ private List<ExtractedImage> extractImagesFromDoc(AbstractFile af) { List<ExtractedImage> listOfExtractedImages; HWPFDocument doc = null; try { doc = new HWPFDocument(new ReadContentInputStream(af)); } catch (Throwable ex) { // instantiating POI containers throw RuntimeExceptions logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.docContainer.init.err", af.getName()), ex); //NON-NLS return null; } PicturesTable pictureTable = null; List<org.apache.poi.hwpf.usermodel.Picture> listOfAllPictures = null; try { pictureTable = doc.getPicturesTable(); listOfAllPictures = pictureTable.getAllPictures(); } catch (Exception ex) { // log internal Java and Apache errors as WARNING logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.processing.err", af.getName()), ex); //NON-NLS return null; } String outputFolderPath; if (listOfAllPictures.isEmpty()) { return null; } else { outputFolderPath = getOutputFolderPath(this.parentFileName); } if (outputFolderPath == null) { return null; } listOfExtractedImages = new ArrayList<>(); byte[] data = null; for (org.apache.poi.hwpf.usermodel.Picture picture : listOfAllPictures) { String fileName = picture.suggestFullFileName(); try { data = picture.getContent(); } catch (Exception ex) { // log internal Java and Apache errors as WARNING logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.processing.err", af.getName()), ex); //NON-NLS return null; } writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data); // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime listOfExtractedImages .add(new 
ExtractedImage(fileName, getFileRelativePath(fileName), picture.getSize(), af)); } return listOfExtractedImages; }
From source file:org.wandora.utils.MSOfficeBox.java
License:Open Source License
public static String getWordTextOld(InputStream is) { try {//from w w w .ja v a 2 s. co m return getWordTextOld(new HWPFDocument(is)); } catch (Exception e) { e.printStackTrace(); } return null; }
From source file:poi.hslf.examples.DataExtraction.java
License:Apache License
public static void main(String args[]) throws Exception { if (args.length == 0) { usage();/* w w w . java 2s .c om*/ return; } FileInputStream is = new FileInputStream(args[0]); SlideShow ppt = new SlideShow(is); is.close(); //extract all sound files embedded in this presentation SoundData[] sound = ppt.getSoundData(); for (int i = 0; i < sound.length; i++) { String type = sound[i].getSoundType(); //*.wav String name = sound[i].getSoundName(); //typically file name byte[] data = sound[i].getData(); //raw bytes //save the sound on disk FileOutputStream out = new FileOutputStream(name + type); out.write(data); out.close(); } //extract embedded OLE documents Slide[] slide = ppt.getSlides(); for (int i = 0; i < slide.length; i++) { Shape[] shape = slide[i].getShapes(); for (int j = 0; j < shape.length; j++) { if (shape[j] instanceof OLEShape) { OLEShape ole = (OLEShape) shape[j]; ObjectData data = ole.getObjectData(); String name = ole.getInstanceName(); if ("Worksheet".equals(name)) { //read xls HSSFWorkbook wb = new HSSFWorkbook(data.getData()); } else if ("Document".equals(name)) { HWPFDocument doc = new HWPFDocument(data.getData()); //read the word document Range r = doc.getRange(); for (int k = 0; k < r.numParagraphs(); k++) { Paragraph p = r.getParagraph(k); System.out.println(p.text()); } //save on disk FileOutputStream out = new FileOutputStream(name + "-(" + (j) + ").doc"); doc.write(out); out.close(); } else { FileOutputStream out = new FileOutputStream(ole.getProgID() + "-" + (j + 1) + ".dat"); InputStream dis = data.getData(); byte[] chunk = new byte[2048]; int count; while ((count = dis.read(chunk)) >= 0) { out.write(chunk, 0, count); } is.close(); out.close(); } } } } //Pictures for (int i = 0; i < slide.length; i++) { Shape[] shape = slide[i].getShapes(); for (int j = 0; j < shape.length; j++) { if (shape[j] instanceof Picture) { Picture p = (Picture) shape[j]; PictureData data = p.getPictureData(); String name = p.getPictureName(); int type = 
data.getType(); String ext; switch (type) { case Picture.JPEG: ext = ".jpg"; break; case Picture.PNG: ext = ".png"; break; case Picture.WMF: ext = ".wmf"; break; case Picture.EMF: ext = ".emf"; break; case Picture.PICT: ext = ".pict"; break; case Picture.DIB: ext = ".dib"; break; default: continue; } FileOutputStream out = new FileOutputStream("pict-" + j + ext); out.write(data.getData()); out.close(); } } } }
From source file:poi.hssf.usermodel.examples.EmeddedObjects.java
License:Apache License
public static void main(String[] args) throws Exception { POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(args[0])); HSSFWorkbook workbook = new HSSFWorkbook(fs); for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) { //the OLE2 Class Name of the object String oleName = obj.getOLE2ClassName(); if (oleName.equals("Worksheet")) { DirectoryNode dn = (DirectoryNode) obj.getDirectory(); HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false); //System.out.println(entry.getName() + ": " + embeddedWorkbook.getNumberOfSheets()); } else if (oleName.equals("Document")) { DirectoryNode dn = (DirectoryNode) obj.getDirectory(); HWPFDocument embeddedWordDocument = new HWPFDocument(dn); //System.out.println(entry.getName() + ": " + embeddedWordDocument.getRange().text()); } else if (oleName.equals("Presentation")) { DirectoryNode dn = (DirectoryNode) obj.getDirectory(); SlideShow embeddedPowerPointDocument = new SlideShow(new HSLFSlideShow(dn)); //System.out.println(entry.getName() + ": " + embeddedPowerPointDocument.getSlides().length); } else {/*from ww w . j a v a 2s . co m*/ if (obj.hasDirectoryEntry()) { // The DirectoryEntry is a DocumentNode. Examine its entries to find out what it is DirectoryNode dn = (DirectoryNode) obj.getDirectory(); for (Iterator entries = dn.getEntries(); entries.hasNext();) { Entry entry = (Entry) entries.next(); //System.out.println(oleName + "." + entry.getName()); } } else { // There is no DirectoryEntry // Recover the object's data from the HSSFObjectData instance. byte[] objectData = obj.getObjectData(); } } } }
From source file:poi.hwpf.Word2Forrest.java
License:Apache License
public static void main(String[] args) { try {// www .j a va 2 s . c o m OutputStream out = new FileOutputStream("c:\\test.xml"); new Word2Forrest(new HWPFDocument(new FileInputStream(args[0])), out); out.close(); } catch (Throwable t) { t.printStackTrace(); } }