List of usage examples for org.apache.poi.hwpf HWPFDocument getPicturesTable
public PicturesTable getPicturesTable()
From source file:javaapplication1.utils.MyWordToHtml.java
public static void convert(String path, String file) throws Throwable { InputStream input = new FileInputStream(path + file); HWPFDocument wordDocument = new HWPFDocument(input); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); MyPictureManager pictureManager = new MyPictureManager(); wordToHtmlConverter.setPicturesManager(pictureManager); wordToHtmlConverter.processDocument(wordDocument); List<?> pics = wordDocument.getPicturesTable().getAllPictures(); File dir = new File("D:\\pics"); dir.mkdir();// w ww . ja v a 2 s . c o m if (pics != null) { for (int i = 0; i < pics.size(); i++) { Picture pic = (Picture) pics.get(i); try { pic.writeImageContent(new FileOutputStream(path + "pics/" + pic.suggestFullFileName())); } catch (FileNotFoundException e) { e.printStackTrace(); } } } Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream outStream = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(outStream); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); outStream.close(); String content = new String(outStream.toByteArray()); writeFile(content, path + "result.html", "UTF-8"); }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
private void extractImageText(XHTMLContentHandler xhtml, HWPFDocument document) { if (Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR)) { TikaImageHelper helper = new TikaImageHelper(metadata); try {//from w ww. ja va2s. c o m List<Picture> pictures2 = document.getPicturesTable().getAllPictures(); for (Picture picture : pictures2) { ByteArrayInputStream imageData = new ByteArrayInputStream(picture.getContent()); helper.addImage(ImageIO.read(imageData)); } // TODO: find out page number helper.addTextToHandler(xhtml); } catch (Exception e) { e.printStackTrace(); } finally { if (helper != null) { helper.close(); } } } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { HWPFDocument document; try {//from www. j av a 2 s . com document = new HWPFDocument(root); } catch (OldWordFileFormatException e) { parseWord6(root, xhtml); return; } org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor( document); // mj extractImageText(xhtml, document); HeaderStories headerFooter = new HeaderStories(document); // Grab the list of pictures. As far as we can tell, // the pictures should be in order, and may be directly // placed or referenced from an anchor PicturesTable pictureTable = document.getPicturesTable(); PicturesSource pictures = new PicturesSource(document); // Do any headers, if present Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() }; handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); // Do the main paragraph text Range r = document.getRange(); for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml); } // Do everything else for (String paragraph : wordExtractor.getMainTextboxText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getCommentsText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getEndnoteText()) { xhtml.element("p", paragraph); } // Do any footers, if present Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() }; handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); // Handle any pictures that we haven't output yet for (Picture p = pictures.nextUnclaimed(); p != null;) { handlePictureCharacterRun(null, p, pictures, xhtml); p = pictures.nextUnclaimed(); } // Handle any embeded office documents try { DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); for (Entry entry : op) { if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } } } catch (FileNotFoundException e) { } }
From source file:org.apache.tika.parser.microsoft.WordExtractor.java
License:Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { HWPFDocument document; try {//from www . j a v a 2 s .c o m document = new HWPFDocument(root); } catch (OldWordFileFormatException e) { parseWord6(root, xhtml); return; } org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor( document); HeaderStories headerFooter = new HeaderStories(document); // Grab the list of pictures. As far as we can tell, // the pictures should be in order, and may be directly // placed or referenced from an anchor PicturesTable pictureTable = document.getPicturesTable(); PicturesSource pictures = new PicturesSource(document); // Do any headers, if present Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() }; handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); // Do the main paragraph text Range r = document.getRange(); ListManager listManager = new ListManager(document); for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml); } // Do everything else for (String paragraph : wordExtractor.getMainTextboxText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getCommentsText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getEndnoteText()) { xhtml.element("p", paragraph); } // Do any footers, if present Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() }; handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); // Handle any pictures that we haven't output yet for (Picture p = pictures.nextUnclaimed(); p != null;) { handlePictureCharacterRun(null, p, pictures, xhtml); p = pictures.nextUnclaimed(); } // Handle any embeded office documents try { DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); for (Entry entry : op) { if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } } } catch (FileNotFoundException e) { } }
From source file:org.docx4j.convert.in.Doc.java
License:Apache License
private static org.docx4j.wml.P handleP(WordprocessingMLPackage wordMLPackage, HWPFDocument doc, Paragraph p, org.apache.poi.hwpf.model.StyleSheet stylesheet, MainDocumentPart documentPart, org.docx4j.wml.ObjectFactory factory) { org.docx4j.wml.P wmlP = null;/* w w w .java2 s .c om*/ if (p.getStyleIndex() > 0) { log.debug("Styled paragraph, with index: " + p.getStyleIndex()); String styleName = stylesheet.getStyleDescription(p.getStyleIndex()).getName(); log.debug(styleName); wmlP = documentPart.createStyledParagraphOfText(stripSpace(styleName), null); } else { wmlP = documentPart.createParagraphOfText(null); } // LineSpacingDescriptor lsd = p.getLineSpacing(); // if (lsd==null || lsd.isEmpty()) { // // do nothing // } else { // PPr pPr = wmlP.getPPr(); // if (pPr==null) { // pPr = Context.getWmlObjectFactory().createPPr(); // wmlP.setPPr(pPr); // } // Spacing spacing = // Context.getWmlObjectFactory().createPPrBaseSpacing(); // spacing.setLine(lsd._dyaLine); // not visible // spacing.setLineRule(STLineSpacingRule.AUTO); // pPr.setSpacing(spacing); // } for (int z = 0; z < p.numCharacterRuns(); z++) { // character run CharacterRun run = p.getCharacterRun(z); // No character styles defined in there?? org.docx4j.wml.RPr rPr = null; if (run.isBold()) { // TODO - HIGH PRIORITY- handle other run properties // esp underline, font size if (rPr == null) { rPr = factory.createRPr(); } org.docx4j.wml.BooleanDefaultTrue boldOn = factory.createBooleanDefaultTrue(); boldOn.setVal(Boolean.TRUE); rPr.setB(boldOn); } //Process image if (doc instanceof HWPFDocument && ((HWPFDocument) doc).getPicturesTable().hasPicture(run)) { Picture picture = doc.getPicturesTable().extractPicture(run, true); Inline inline; try { BinaryPartAbstractImage imagePart = BinaryPartAbstractImage.createImagePart(wordMLPackage, picture.getContent()); long cx = UnitsOfMeasurement .twipToEMU(Math.round((double) imagePart.getImageInfo().getSize().getWidthMpt() * ((double) picture.getHorizontalScalingFactor() * 0.00001d))) * 2L; long cy = UnitsOfMeasurement .twipToEMU(Math.round((double) imagePart.getImageInfo().getSize().getHeightMpt() * ((double) picture.getVerticalScalingFactor() * 0.00001d))) * 2L; inline = imagePart.createImageInline(null, "", ID1++, ID2++, cx, cy, false); org.docx4j.wml.R imgrun = factory.createR(); org.docx4j.wml.Drawing drawing = factory.createDrawing(); imgrun.getContent().add(drawing); drawing.getAnchorOrInline().add(inline); wmlP.getContent().add(imgrun); } catch (Exception e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } else { // character run text String text = run.text(); // show us the text log.debug("Processing: " + text); String cleansed = stripNonValidXMLCharacters(text); // Necessary to avoid org.xml.sax.SAXParseException: An invalid // XML character // (Unicode: 0xb) was found in the element content of the // document. // when trying to open the resulting docx. // ie JAXB happily writes (marshals) it, but doesn't want to // unmarshall. if (!text.equals(cleansed)) { log.warn("Cleansed.."); } org.docx4j.wml.Text t = factory.createText(); t.setValue(cleansed); org.docx4j.wml.R wmlRun = factory.createR(); if (rPr != null) { wmlRun.setRPr(rPr); } wmlRun.getRunContent().add(t); wmlP.getParagraphContent().add(wmlRun); } } System.out.println(XmlUtils.marshaltoString(wmlP, true, true)); return wmlP; }
From source file:org.opf_labs.aqua.OfficeAnalyser.java
License:Apache License
public static void main(String[] args) throws Exception { //import org.apache.poi.poifs.dev.POIFSDump; //POIFSDump.main(args); SMOutputDocument xmldoc = SMOutputFactory.createOutputDocument( SMOutputFactory.getGlobalXMLOutputFactory().createXMLStreamWriter(System.out, "UTF-8"), "1.1", "UTF-8", true); xmldoc.setIndentation("\n ", 1, 2); // for unix linefeed, 2 spaces per level SMOutputElement xmlroot = xmldoc.addElement("properties"); // Loop through arguments: for (int i = 0; i < args.length; i++) { SMOutputElement xd = xmlroot.addElement("document"); xd.addAttribute("href", args[i]); HWPFDocument doc = new HWPFDocument(new FileInputStream(args[i])); // SummaryInformation SMOutputElement sie = xd.addElement("SummaryInformation"); sie.addElement("ApplicationName").addCharacters(doc.getSummaryInformation().getApplicationName()); sie.addElement("OSVersion").addCharacters("" + doc.getSummaryInformation().getOSVersion()); sie.addElement("Author").addCharacters("" + doc.getSummaryInformation().getAuthor()); sie.addElement("CharCount").addCharacters("" + doc.getSummaryInformation().getCharCount()); sie.addElement("Comments").addCharacters("" + doc.getSummaryInformation().getComments()); sie.addElement("EditTime").addCharacters("" + doc.getSummaryInformation().getEditTime()); sie.addElement("Format").addCharacters("" + doc.getSummaryInformation().getFormat()); sie.addElement("Keywords").addCharacters("" + doc.getSummaryInformation().getKeywords()); sie.addElement("LastAuthor").addCharacters("" + doc.getSummaryInformation().getLastAuthor()); sie.addElement("PageCount").addCharacters("" + doc.getSummaryInformation().getPageCount()); sie.addElement("RevNumber").addCharacters("" + doc.getSummaryInformation().getRevNumber()); sie.addElement("SectionCount").addCharacters("" + doc.getSummaryInformation().getSectionCount()); sie.addElement("Security").addCharacters("" + doc.getSummaryInformation().getSecurity()); sie.addElement("Subject").addCharacters("" + doc.getSummaryInformation().getSubject()); sie.addElement("Template").addCharacters("" + doc.getSummaryInformation().getTemplate()); sie.addElement("Title").addCharacters("" + doc.getSummaryInformation().getTitle()); sie.addElement("WordCount").addCharacters("" + doc.getSummaryInformation().getWordCount()); sie.addElement("CreatedDateTime").addCharacters("" + doc.getSummaryInformation().getCreateDateTime()); sie.addElement("LastPrinted").addCharacters("" + doc.getSummaryInformation().getLastPrinted()); sie.addElement("LastSaveDateTime") .addCharacters("" + doc.getSummaryInformation().getLastSaveDateTime()); sie.addElement("Thumbnail").addCharacters("" + doc.getSummaryInformation().getThumbnail()); // TextTable SMOutputElement tte = xd.addElement("TextTable"); for (TextPiece tp : doc.getTextTable().getTextPieces()) { SMOutputElement tpe = tte.addElement("TextPiece"); tpe.addAttribute("isUnicode", "" + tp.getPieceDescriptor().isUnicode()); tpe.addCharacters(tp.getStringBuilder().toString()); }// w w w . j a v a 2 s . c o m // DocumentSummaryInformation SMOutputElement dsie = xd.addElement("DocumentSummaryInformation"); dsie.addElement("ParCount").addCharacters("" + doc.getDocumentSummaryInformation().getParCount()); dsie.addElement("ByteCount").addCharacters("" + doc.getDocumentSummaryInformation().getByteCount()); dsie.addElement("HiddenCount").addCharacters("" + doc.getDocumentSummaryInformation().getHiddenCount()); dsie.addElement("LineCount").addCharacters("" + doc.getDocumentSummaryInformation().getLineCount()); dsie.addElement("MMClipCount").addCharacters("" + doc.getDocumentSummaryInformation().getMMClipCount()); dsie.addElement("NoteCount").addCharacters("" + doc.getDocumentSummaryInformation().getNoteCount()); dsie.addElement("SectionCount") .addCharacters("" + doc.getDocumentSummaryInformation().getSectionCount()); dsie.addElement("SlideCount").addCharacters("" + doc.getDocumentSummaryInformation().getSlideCount()); dsie.addElement("Format").addCharacters("" + doc.getDocumentSummaryInformation().getFormat()); dsie.addElement("PresentationFormat") .addCharacters("" + doc.getDocumentSummaryInformation().getPresentationFormat()); dsie.addElement("Company").addCharacters("" + doc.getDocumentSummaryInformation().getCompany()); dsie.addElement("Category").addCharacters("" + doc.getDocumentSummaryInformation().getCategory()); // Sections for (Object os : doc.getDocumentSummaryInformation().getSections()) { Section s = (Section) os; SMOutputElement se = dsie.addElement("Section"); se.addElement("FormatID").addCharacters("" + s.getFormatID()); se.addElement("CodePage").addCharacters("" + s.getCodepage()); se.addElement("PropertyCount").addCharacters("" + s.getPropertyCount()); for (Property sp : s.getProperties()) { SMOutputElement pe = se.addElement("Property"); pe.addAttribute("class", sp.getValue().getClass().getCanonicalName()); pe.addCharacters(sp.getValue().toString()); } } SMOutputElement fte = xd.addElement("FontTable"); for (Ffn f : doc.getFontTable().getFontNames()) { SMOutputElement fe = fte.addElement("Font"); fe.addElement("MainFontName").addCharacters(f.getMainFontName()); try { fe.addElement("AltFontName").addCharacters(f.getAltFontName()); } catch (Exception e) { // Seems to fail, and no safe test found as yet. } fe.addElement("Size").addCharacters("" + f.getSize()); fe.addElement("Weight").addCharacters("" + f.getWeight()); } SMOutputElement pte = xd.addElement("PicturesTable"); for (Picture p : doc.getPicturesTable().getAllPictures()) { SMOutputElement pe = pte.addElement("Picture"); pe.addElement("MimeType").addCharacters(p.getMimeType()); pe.addElement("Width").addCharacters("" + p.getWidth()); pe.addElement("Height").addCharacters("" + p.getHeight()); pe.addElement("HorizontalScalingFactor").addCharacters("" + p.getHorizontalScalingFactor()); pe.addElement("VerticalScalingFactor").addCharacters("" + p.getVerticalScalingFactor()); pe.addElement("Content").addCharacters("" + p.getContent()); } //parseCompObj( new File(args[i]) ); // This //System.out.println("Dumping " + args[i]); FileInputStream is = new FileInputStream(args[i]); POIFSFileSystem fs = new POIFSFileSystem(is); is.close(); DirectoryEntry root = fs.getRoot(); //dump(root); xmldoc.closeRoot(); // important, flushes, closes output } }
From source file:org.sleuthkit.autopsy.imageExtractor.ImageExtractor.java
private List<ExtractedImage> extractImagesFromDoc(AbstractFile af) { // TODO check for BBArtifact ENCRYPTION_DETECTED? Might be detected elsewhere...? List<ExtractedImage> listOfExtractedImages = new ArrayList<ExtractedImage>(); String parentFileName = getUniqueName(af); HWPFDocument docA = null; try {/* ww w .j a va2 s .c o m*/ docA = new HWPFDocument(new ReadContentInputStream(af)); } catch (IOException ex) { logger.log(Level.WARNING, "HWPFDocument container could not be instantiated while reading " + af.getName(), ex); return null; } PicturesTable pictureTable = docA.getPicturesTable(); List<org.apache.poi.hwpf.usermodel.Picture> listOfAllPictures = pictureTable.getAllPictures(); String outputFolderPath; if (listOfAllPictures.isEmpty()) { return null; } else { outputFolderPath = getOutputFolderPath(parentFileName); } if (outputFolderPath == null) { logger.log(Level.WARNING, "Could not get path for image extraction from AbstractFile: {0}", af.getName()); return null; } for (org.apache.poi.hwpf.usermodel.Picture picture : listOfAllPictures) { FileOutputStream fos = null; String fileName = picture.suggestFullFileName(); try { fos = new FileOutputStream(outputFolderPath + File.separator + fileName); } catch (FileNotFoundException ex) { logger.log(Level.WARNING, "Invalid path provided for image extraction", ex); continue; } try { fos.write(picture.getContent()); fos.close(); } catch (IOException ex) { logger.log(Level.WARNING, "Could not write to the provided location", ex); continue; } // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime String fileRelativePath = File.separator + moduleDirRelative + File.separator + parentFileName + File.separator + fileName; long size = picture.getSize(); ExtractedImage extractedimage = new ExtractedImage(fileName, fileRelativePath, size, af); listOfExtractedImages.add(extractedimage); } return listOfExtractedImages; }
From source file:org.sleuthkit.autopsy.modules.embeddedfileextractor.ImageExtractor.java
License:Open Source License
/** * Extract images from doc format files. * * @param af the file from which images are to be extracted. * * @return list of extracted images. Returns null in case no images were * extracted./* w ww.j a va2 s. c o m*/ */ private List<ExtractedImage> extractImagesFromDoc(AbstractFile af) { List<ExtractedImage> listOfExtractedImages; HWPFDocument doc = null; try { doc = new HWPFDocument(new ReadContentInputStream(af)); } catch (Throwable ex) { // instantiating POI containers throw RuntimeExceptions logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.docContainer.init.err", af.getName()), ex); //NON-NLS return null; } PicturesTable pictureTable = null; List<org.apache.poi.hwpf.usermodel.Picture> listOfAllPictures = null; try { pictureTable = doc.getPicturesTable(); listOfAllPictures = pictureTable.getAllPictures(); } catch (Exception ex) { // log internal Java and Apache errors as WARNING logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.processing.err", af.getName()), ex); //NON-NLS return null; } String outputFolderPath; if (listOfAllPictures.isEmpty()) { return null; } else { outputFolderPath = getOutputFolderPath(this.parentFileName); } if (outputFolderPath == null) { return null; } listOfExtractedImages = new ArrayList<>(); byte[] data = null; for (org.apache.poi.hwpf.usermodel.Picture picture : listOfAllPictures) { String fileName = picture.suggestFullFileName(); try { data = picture.getContent(); } catch (Exception ex) { // log internal Java and Apache errors as WARNING logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.processing.err", af.getName()), ex); //NON-NLS return null; } writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data); // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime listOfExtractedImages .add(new ExtractedImage(fileName, getFileRelativePath(fileName), picture.getSize(), af)); } return listOfExtractedImages; }