Example usage of the org.apache.poi.hwpf.HWPFDocument constructor

Introduction

This page collects example usages of the org.apache.poi.hwpf.HWPFDocument constructor, taken from several open-source projects.

Prototype

public HWPFDocument(DirectoryNode directory) throws IOException

Source Link

Document

This constructor loads a Word document from a specific directory inside a POIFSFileSystem, which is usually not the root directory (for example, a Word document embedded in another file).

Usage

From source file:org.opf_labs.aqua.OfficeAnalyser.java

License:Apache License

/**
 * Writes one XML document to standard output describing each Word file named on the
 * command line: its SummaryInformation, text pieces, DocumentSummaryInformation
 * (including every section and property), font table and pictures table.
 *
 * @param args paths of the Word files to analyse; one "document" element is emitted per path
 * @throws Exception if a file cannot be opened or parsed, or the XML cannot be written
 */
public static void main(String[] args) throws Exception {
    SMOutputDocument xmldoc = SMOutputFactory.createOutputDocument(
            SMOutputFactory.getGlobalXMLOutputFactory().createXMLStreamWriter(System.out, "UTF-8"), "1.1",
            "UTF-8", true);

    xmldoc.setIndentation("\n ", 1, 2); // for unix linefeed, 2 spaces per level

    SMOutputElement xmlroot = xmldoc.addElement("properties");

    // Loop through arguments:
    for (int i = 0; i < args.length; i++) {
        SMOutputElement xd = xmlroot.addElement("document");
        xd.addAttribute("href", args[i]);

        // Close the file explicitly once the document has been constructed from it.
        HWPFDocument doc;
        FileInputStream in = new FileInputStream(args[i]);
        try {
            doc = new HWPFDocument(in);
        } finally {
            in.close();
        }

        // SummaryInformation
        SMOutputElement sie = xd.addElement("SummaryInformation");
        sie.addElement("ApplicationName").addCharacters(doc.getSummaryInformation().getApplicationName());
        sie.addElement("OSVersion").addCharacters("" + doc.getSummaryInformation().getOSVersion());
        sie.addElement("Author").addCharacters("" + doc.getSummaryInformation().getAuthor());
        sie.addElement("CharCount").addCharacters("" + doc.getSummaryInformation().getCharCount());
        sie.addElement("Comments").addCharacters("" + doc.getSummaryInformation().getComments());
        sie.addElement("EditTime").addCharacters("" + doc.getSummaryInformation().getEditTime());
        sie.addElement("Format").addCharacters("" + doc.getSummaryInformation().getFormat());
        sie.addElement("Keywords").addCharacters("" + doc.getSummaryInformation().getKeywords());
        sie.addElement("LastAuthor").addCharacters("" + doc.getSummaryInformation().getLastAuthor());
        sie.addElement("PageCount").addCharacters("" + doc.getSummaryInformation().getPageCount());
        sie.addElement("RevNumber").addCharacters("" + doc.getSummaryInformation().getRevNumber());
        sie.addElement("SectionCount").addCharacters("" + doc.getSummaryInformation().getSectionCount());
        sie.addElement("Security").addCharacters("" + doc.getSummaryInformation().getSecurity());
        sie.addElement("Subject").addCharacters("" + doc.getSummaryInformation().getSubject());
        sie.addElement("Template").addCharacters("" + doc.getSummaryInformation().getTemplate());
        sie.addElement("Title").addCharacters("" + doc.getSummaryInformation().getTitle());
        sie.addElement("WordCount").addCharacters("" + doc.getSummaryInformation().getWordCount());
        sie.addElement("CreatedDateTime").addCharacters("" + doc.getSummaryInformation().getCreateDateTime());
        sie.addElement("LastPrinted").addCharacters("" + doc.getSummaryInformation().getLastPrinted());
        sie.addElement("LastSaveDateTime")
                .addCharacters("" + doc.getSummaryInformation().getLastSaveDateTime());
        sie.addElement("Thumbnail").addCharacters("" + doc.getSummaryInformation().getThumbnail());

        // TextTable
        SMOutputElement tte = xd.addElement("TextTable");
        for (TextPiece tp : doc.getTextTable().getTextPieces()) {
            SMOutputElement tpe = tte.addElement("TextPiece");
            tpe.addAttribute("isUnicode", "" + tp.getPieceDescriptor().isUnicode());
            tpe.addCharacters(tp.getStringBuilder().toString());
        }

        // DocumentSummaryInformation
        SMOutputElement dsie = xd.addElement("DocumentSummaryInformation");
        dsie.addElement("ParCount").addCharacters("" + doc.getDocumentSummaryInformation().getParCount());
        dsie.addElement("ByteCount").addCharacters("" + doc.getDocumentSummaryInformation().getByteCount());
        dsie.addElement("HiddenCount").addCharacters("" + doc.getDocumentSummaryInformation().getHiddenCount());
        dsie.addElement("LineCount").addCharacters("" + doc.getDocumentSummaryInformation().getLineCount());
        dsie.addElement("MMClipCount").addCharacters("" + doc.getDocumentSummaryInformation().getMMClipCount());
        dsie.addElement("NoteCount").addCharacters("" + doc.getDocumentSummaryInformation().getNoteCount());
        dsie.addElement("SectionCount")
                .addCharacters("" + doc.getDocumentSummaryInformation().getSectionCount());
        dsie.addElement("SlideCount").addCharacters("" + doc.getDocumentSummaryInformation().getSlideCount());
        dsie.addElement("Format").addCharacters("" + doc.getDocumentSummaryInformation().getFormat());
        dsie.addElement("PresentationFormat")
                .addCharacters("" + doc.getDocumentSummaryInformation().getPresentationFormat());
        dsie.addElement("Company").addCharacters("" + doc.getDocumentSummaryInformation().getCompany());
        dsie.addElement("Category").addCharacters("" + doc.getDocumentSummaryInformation().getCategory());

        // Sections
        for (Object os : doc.getDocumentSummaryInformation().getSections()) {
            Section s = (Section) os;
            SMOutputElement se = dsie.addElement("Section");
            se.addElement("FormatID").addCharacters("" + s.getFormatID());
            se.addElement("CodePage").addCharacters("" + s.getCodepage());
            se.addElement("PropertyCount").addCharacters("" + s.getPropertyCount());
            for (Property sp : s.getProperties()) {
                SMOutputElement pe = se.addElement("Property");
                pe.addAttribute("class", sp.getValue().getClass().getCanonicalName());
                pe.addCharacters(sp.getValue().toString());
            }
        }

        // FontTable
        SMOutputElement fte = xd.addElement("FontTable");
        for (Ffn f : doc.getFontTable().getFontNames()) {
            SMOutputElement fe = fte.addElement("Font");
            fe.addElement("MainFontName").addCharacters(f.getMainFontName());
            try {
                fe.addElement("AltFontName").addCharacters(f.getAltFontName());
            } catch (Exception ignored) {
                // Seems to fail, and no safe test found as yet.
            }
            fe.addElement("Size").addCharacters("" + f.getSize());
            fe.addElement("Weight").addCharacters("" + f.getWeight());
        }

        // PicturesTable
        SMOutputElement pte = xd.addElement("PicturesTable");
        for (Picture p : doc.getPicturesTable().getAllPictures()) {
            SMOutputElement pe = pte.addElement("Picture");
            pe.addElement("MimeType").addCharacters(p.getMimeType());
            pe.addElement("Width").addCharacters("" + p.getWidth());
            pe.addElement("Height").addCharacters("" + p.getHeight());
            pe.addElement("HorizontalScalingFactor").addCharacters("" + p.getHorizontalScalingFactor());
            pe.addElement("VerticalScalingFactor").addCharacters("" + p.getVerticalScalingFactor());
            pe.addElement("Content").addCharacters("" + p.getContent());
        }
    }

    // Close the root only after every document has been appended. Closing it inside the
    // loop ended the XML output after the first file.
    xmldoc.closeRoot(); // important, flushes, closes output
}

From source file:org.paxle.parser.msoffice.impl.MsWordParser.java

License:Open Source License

/**
 * Appends the text of every paragraph in the Word document to the parser document,
 * following each paragraph with a single space.
 *
 * @param fs        the file system holding the Word document
 * @param parserDoc the document that receives the extracted text
 */
@Override
protected void extractText(POIFSFileSystem fs, IParserDocument parserDoc) throws ParserException, IOException {
    final HWPFDocument wordDocument = new HWPFDocument(fs);
    final Range wholeRange = wordDocument.getRange();

    int index = 0;
    while (index < wholeRange.numParagraphs()) {
        final Paragraph paragraph = wholeRange.getParagraph(index);
        parserDoc.append(paragraph.text());
        // a paragraph ends a block of text, so a separator can follow it
        parserDoc.append(' ');
        index++;
    }
}

From source file:org.sakaiproject.contentreview.impl.compilatio.CompilatioContentValidator.java

License:Educational Community License

/**
 * Returns the word count stored in the summary information of a Word document.
 *
 * The fallback value 100 is returned when the "tii.checkWordLength" setting is off, when
 * the stored count is 0, when the document has no summary information, or when the
 * document cannot be read.
 *
 * @param resource the content resource holding the Word document
 * @return the stored word count, or 100 as described above
 */
private int wordDocLength(ContentResource resource) {
    final int fallbackCount = 100;

    if (!serverConfigurationService.getBoolean("tii.checkWordLength", false)) {
        return fallbackCount;
    }

    try {
        POIFSFileSystem pfs = new POIFSFileSystem(resource.streamContent());
        HWPFDocument doc = new HWPFDocument(pfs);
        SummaryInformation dsi = doc.getSummaryInformation();
        if (dsi == null) {
            // no summary property set in the document, so there is no count to read
            log.debug("document has no summary information");
            return fallbackCount;
        }
        int count = dsi.getWordCount();
        log.debug("got a count of " + count);
        // if this == 0 then it's likely that something went wrong - poi couldn't read it
        if (count == 0) {
            return fallbackCount;
        }
        return count;
    } catch (IOException e) {
        log.warn("Could not read Word document to determine its word count", e);
    } catch (ServerOverloadException e) {
        log.warn("Could not stream Word document to determine its word count", e);
    } catch (RuntimeException e) {
        // an unreadable document must also fall through to the cautious default
        log.warn("Could not parse Word document to determine its word count", e);
    }
    // in case we can't read this let's err on the side of caution
    return fallbackCount;
}

From source file:org.shareok.data.documentProcessor.WordHandler.java

/**
 * Reads the paragraph texts of a Word document from the given stream and always closes
 * the stream afterwards.
 *
 * @param fs the stream positioned at the start of the Word document
 * @return the paragraph texts, or null when the document could not be read
 * @throws IOException if closing the stream fails
 */
private String[] readDocFile(FileInputStream fs) throws IOException {
    try {
        final WordExtractor extractor = new WordExtractor(new HWPFDocument(fs));
        return extractor.getParagraphText();
    } catch (Exception ex) {
        ex.printStackTrace();
        return null;
    } finally {
        fs.close();
    }
}

From source file:org.sleuthkit.autopsy.imageExtractor.ImageExtractor.java

/**
 * Extracts the pictures embedded in a Word document and writes each one into the folder
 * returned by getOutputFolderPath for this file.
 *
 * @param af the file from which images are to be extracted
 * @return the images that were written; null when the document cannot be read, contains
 *         no pictures, or no output folder path is available
 */
private List<ExtractedImage> extractImagesFromDoc(AbstractFile af) {
    // TODO check for BBArtifact ENCRYPTION_DETECTED? Might be detected elsewhere...?
    List<ExtractedImage> listOfExtractedImages = new ArrayList<ExtractedImage>();
    String parentFileName = getUniqueName(af);
    HWPFDocument docA = null;
    try {
        docA = new HWPFDocument(new ReadContentInputStream(af));
    } catch (IOException ex) {
        logger.log(Level.WARNING,
                "HWPFDocument container could not be instantiated while reading " + af.getName(), ex);
        return null;
    }
    PicturesTable pictureTable = docA.getPicturesTable();
    List<org.apache.poi.hwpf.usermodel.Picture> listOfAllPictures = pictureTable.getAllPictures();
    if (listOfAllPictures.isEmpty()) {
        return null;
    }
    String outputFolderPath = getOutputFolderPath(parentFileName);
    if (outputFolderPath == null) {
        logger.log(Level.WARNING, "Could not get path for image extraction from AbstractFile: {0}",
                af.getName());
        return null;
    }
    for (org.apache.poi.hwpf.usermodel.Picture picture : listOfAllPictures) {
        FileOutputStream fos = null;
        String fileName = picture.suggestFullFileName();
        try {
            fos = new FileOutputStream(outputFolderPath + File.separator + fileName);
        } catch (FileNotFoundException ex) {
            logger.log(Level.WARNING, "Invalid path provided for image extraction", ex);
            continue;
        }
        try {
            try {
                fos.write(picture.getContent());
            } finally {
                // close on every path so a failed write does not leak the file handle
                fos.close();
            }
        } catch (IOException ex) {
            logger.log(Level.WARNING, "Could not write to the provided location", ex);
            continue;
        }
        // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime
        String fileRelativePath = File.separator + moduleDirRelative + File.separator + parentFileName
                + File.separator + fileName;
        long size = picture.getSize();
        ExtractedImage extractedimage = new ExtractedImage(fileName, fileRelativePath, size, af);
        listOfExtractedImages.add(extractedimage);
    }

    return listOfExtractedImages;
}

From source file:org.sleuthkit.autopsy.modules.embeddedfileextractor.ImageExtractor.java

License:Open Source License

/**
 * Extract images from doc format files.
 *
 * @param af the file from which images are to be extracted.
 *
 * @return list of extracted images. Returns null in case no images were
 *         extracted./*from  w w w  .j  ava  2  s. c o m*/
 */
private List<ExtractedImage> extractImagesFromDoc(AbstractFile af) {
    List<ExtractedImage> listOfExtractedImages;
    HWPFDocument doc = null;
    try {
        doc = new HWPFDocument(new ReadContentInputStream(af));
    } catch (Throwable ex) {
        // instantiating POI containers throw RuntimeExceptions
        logger.log(Level.WARNING,
                NbBundle.getMessage(this.getClass(),
                        "EmbeddedFileExtractorIngestModule.ImageExtractor.docContainer.init.err", af.getName()),
                ex); //NON-NLS
        return null;
    }

    PicturesTable pictureTable = null;
    List<org.apache.poi.hwpf.usermodel.Picture> listOfAllPictures = null;
    try {
        pictureTable = doc.getPicturesTable();
        listOfAllPictures = pictureTable.getAllPictures();
    } catch (Exception ex) {
        // log internal Java and Apache errors as WARNING
        logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(),
                "EmbeddedFileExtractorIngestModule.ImageExtractor.processing.err", af.getName()), ex); //NON-NLS
        return null;
    }

    String outputFolderPath;
    if (listOfAllPictures.isEmpty()) {
        return null;
    } else {
        outputFolderPath = getOutputFolderPath(this.parentFileName);
    }
    if (outputFolderPath == null) {
        return null;
    }
    listOfExtractedImages = new ArrayList<>();
    byte[] data = null;
    for (org.apache.poi.hwpf.usermodel.Picture picture : listOfAllPictures) {
        String fileName = picture.suggestFullFileName();
        try {
            data = picture.getContent();
        } catch (Exception ex) {
            // log internal Java and Apache errors as WARNING
            logger.log(Level.WARNING,
                    NbBundle.getMessage(this.getClass(),
                            "EmbeddedFileExtractorIngestModule.ImageExtractor.processing.err", af.getName()),
                    ex); //NON-NLS
            return null;
        }
        writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data);
        // TODO Extract more info from the Picture viz ctime, crtime, atime, mtime
        listOfExtractedImages
                .add(new ExtractedImage(fileName, getFileRelativePath(fileName), picture.getSize(), af));
    }

    return listOfExtractedImages;
}

From source file:org.wandora.utils.MSOfficeBox.java

License:Open Source License

/**
 * Builds an HWPFDocument from the stream and delegates to the HWPFDocument overload of
 * this method.
 *
 * @param is the stream containing the Word document
 * @return the result of the delegate call, or null when any exception is thrown
 */
public static String getWordTextOld(InputStream is) {
    String text = null;
    try {
        final HWPFDocument wordDocument = new HWPFDocument(is);
        text = getWordTextOld(wordDocument);
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return text;
}

From source file:poi.hslf.examples.DataExtraction.java

License:Apache License

/**
 * Extracts the data embedded in a PowerPoint presentation into files in the working
 * directory: sound clips, embedded OLE objects (Word documents are also printed
 * paragraph by paragraph, unknown objects are dumped as raw ".dat" files) and pictures.
 *
 * @param args args[0] is the path of the presentation; usage is printed when it is missing
 * @throws Exception if the presentation cannot be read or an output file cannot be written
 */
public static void main(String args[]) throws Exception {

    if (args.length == 0) {
        usage();
        return;
    }

    SlideShow ppt;
    FileInputStream is = new FileInputStream(args[0]);
    try {
        ppt = new SlideShow(is);
    } finally {
        is.close();
    }

    //extract all sound files embedded in this presentation
    SoundData[] sound = ppt.getSoundData();
    for (int i = 0; i < sound.length; i++) {
        String type = sound[i].getSoundType(); //*.wav
        String name = sound[i].getSoundName(); //typically file name
        byte[] data = sound[i].getData(); //raw bytes

        //save the sound  on disk
        FileOutputStream out = new FileOutputStream(name + type);
        try {
            out.write(data);
        } finally {
            out.close();
        }
    }

    //extract embedded OLE documents
    Slide[] slide = ppt.getSlides();
    for (int i = 0; i < slide.length; i++) {
        Shape[] shape = slide[i].getShapes();
        for (int j = 0; j < shape.length; j++) {
            if (shape[j] instanceof OLEShape) {
                OLEShape ole = (OLEShape) shape[j];
                ObjectData data = ole.getObjectData();
                String name = ole.getInstanceName();
                if ("Worksheet".equals(name)) {

                    //read xls
                    HSSFWorkbook wb = new HSSFWorkbook(data.getData());

                } else if ("Document".equals(name)) {
                    HWPFDocument doc = new HWPFDocument(data.getData());
                    //read the word document
                    Range r = doc.getRange();
                    for (int k = 0; k < r.numParagraphs(); k++) {
                        Paragraph p = r.getParagraph(k);
                        System.out.println(p.text());
                    }

                    //save on disk
                    FileOutputStream out = new FileOutputStream(name + "-(" + (j) + ").doc");
                    try {
                        doc.write(out);
                    } finally {
                        out.close();
                    }
                } else {
                    FileOutputStream out = new FileOutputStream(ole.getProgID() + "-" + (j + 1) + ".dat");
                    try {
                        InputStream dis = data.getData();
                        try {
                            byte[] chunk = new byte[2048];
                            int count;
                            while ((count = dis.read(chunk)) >= 0) {
                                out.write(chunk, 0, count);
                            }
                        } finally {
                            // close the object's own data stream; the presentation
                            // stream was already closed right after loading
                            dis.close();
                        }
                    } finally {
                        out.close();
                    }
                }
            }

        }
    }

    //Pictures
    for (int i = 0; i < slide.length; i++) {
        Shape[] shape = slide[i].getShapes();
        for (int j = 0; j < shape.length; j++) {
            if (shape[j] instanceof Picture) {
                Picture p = (Picture) shape[j];
                PictureData data = p.getPictureData();
                String name = p.getPictureName();
                int type = data.getType();
                String ext;
                switch (type) {
                case Picture.JPEG:
                    ext = ".jpg";
                    break;
                case Picture.PNG:
                    ext = ".png";
                    break;
                case Picture.WMF:
                    ext = ".wmf";
                    break;
                case Picture.EMF:
                    ext = ".emf";
                    break;
                case Picture.PICT:
                    ext = ".pict";
                    break;
                case Picture.DIB:
                    ext = ".dib";
                    break;
                default:
                    // unknown picture type: nothing is written for this shape
                    continue;
                }
                FileOutputStream out = new FileOutputStream("pict-" + j + ext);
                try {
                    out.write(data.getData());
                } finally {
                    out.close();
                }
            }

        }
    }

}

From source file:poi.hssf.usermodel.examples.EmeddedObjects.java

License:Apache License

/**
 * Opens the workbook named by the first argument and loads each embedded object
 * according to its OLE2 class name: as a workbook, a Word document or a slide show.
 * Objects of any other class are inspected through their directory entries when they
 * have a directory, otherwise through their raw object data.
 *
 * @param args args[0] is the path of the workbook to open
 * @throws Exception if the workbook or one of its embedded objects cannot be read
 */
public static void main(String[] args) throws Exception {
    final FileInputStream input = new FileInputStream(args[0]);
    final POIFSFileSystem fs = new POIFSFileSystem(input);
    final HSSFWorkbook workbook = new HSSFWorkbook(fs);

    for (HSSFObjectData embedded : workbook.getAllEmbeddedObjects()) {
        // the OLE2 class name decides how the object is loaded
        final String oleName = embedded.getOLE2ClassName();

        if (oleName.equals("Worksheet")) {
            final DirectoryNode directory = (DirectoryNode) embedded.getDirectory();
            HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(directory, fs, false);
            // embeddedWorkbook.getNumberOfSheets() could be reported here
        } else if (oleName.equals("Document")) {
            final DirectoryNode directory = (DirectoryNode) embedded.getDirectory();
            HWPFDocument embeddedWordDocument = new HWPFDocument(directory);
            // embeddedWordDocument.getRange().text() could be reported here
        } else if (oleName.equals("Presentation")) {
            final DirectoryNode directory = (DirectoryNode) embedded.getDirectory();
            SlideShow embeddedPowerPointDocument = new SlideShow(new HSLFSlideShow(directory));
            // embeddedPowerPointDocument.getSlides().length could be reported here
        } else if (embedded.hasDirectoryEntry()) {
            // Unrecognised class with a directory: walk its entries to find out what it is
            final DirectoryNode directory = (DirectoryNode) embedded.getDirectory();
            Iterator entries = directory.getEntries();
            while (entries.hasNext()) {
                Entry entry = (Entry) entries.next();
                // oleName + "." + entry.getName() could be reported here
            }
        } else {
            // No directory entry: recover the object's data from the HSSFObjectData instance
            byte[] objectData = embedded.getObjectData();
        }
    }
}

From source file:poi.hwpf.Word2Forrest.java

License:Apache License

/**
 * Converts the Word document named by the first argument, writing the result to
 * "c:\test.xml". Any failure is printed to standard error.
 *
 * @param args args[0] is the path of the Word document to convert
 */
public static void main(String[] args) {
    try {
        OutputStream out = new FileOutputStream("c:\\test.xml");
        try {
            FileInputStream in = new FileInputStream(args[0]);
            try {
                new Word2Forrest(new HWPFDocument(in), out);
            } finally {
                in.close();
            }
        } finally {
            // close the output even when the conversion fails
            out.close();
        }
    } catch (Throwable t) {
        t.printStackTrace();
    }

}