Example usage for org.apache.poi.hwpf.HWPFDocument#getTextTable

Introduction

This page collects example usages of org.apache.poi.hwpf.HWPFDocument#getTextTable.

Prototype

@Override
    @Internal
    public TextPieceTable getTextTable()

Source Link

Usage

From source file:org.opencrx.kernel.text.WordToText.java

License:BSD License

/**
 * Grabs the text straight out of the document's text pieces. The result
 * might include various bits of crud, but this works in cases where the
 * text piece -> paragraph mapping is broken, and it is fast.
 *
 * <p>Pieces flagged as Unicode are decoded as UTF-16LE, all others as
 * Cp1252. Runs of two or three bare carriage returns are rewritten to
 * CRLF pairs, and a trailing carriage return gets a line feed appended;
 * isolated carriage returns inside the text are left untouched.
 *
 * @param doc the Word document whose text piece table is read
 * @return the concatenated text of all pieces with line endings partly normalised
 * @throws IllegalStateException if the JVM does not support the required encoding
 */
public String getTextFromPieces(HWPFDocument doc) {
    StringBuilder textBuf = new StringBuilder();
    Iterator<TextPiece> textPieces = doc.getTextTable().getTextPieces().iterator();
    while (textPieces.hasNext()) {
        TextPiece piece = textPieces.next();
        String encoding = piece.isUnicode() ? "UTF-16LE" : "Cp1252";
        try {
            textBuf.append(new String(piece.getRawBytes(), encoding));
        } catch (UnsupportedEncodingException e) {
            // Silently skipping the piece would return truncated text with no
            // indication that anything went wrong, so fail loudly instead.
            throw new IllegalStateException("Encoding " + encoding + " is not supported by this JVM", e);
        }
    }
    String text = textBuf.toString();
    // Fix line endings (note: this won't get all of them). The patterns are
    // plain literals, so no regular expression is needed.
    text = text.replace("\r\r\r", "\r\n\r\n\r\n");
    text = text.replace("\r\r", "\r\n\r\n");
    if (text.endsWith("\r")) {
        text += "\n";
    }
    return text;
}

From source file:org.opf_labs.aqua.OfficeAnalyser.java

License:Apache License

/**
 * Writes an XML report to standard output describing each Word document
 * named on the command line: summary information, text pieces, document
 * summary information (including its sections and properties), the font
 * table and the pictures table.
 *
 * @param args paths of the Word documents to analyse
 * @throws Exception if a document cannot be read or the XML cannot be written
 */
public static void main(String[] args) throws Exception {
    //import org.apache.poi.poifs.dev.POIFSDump;
    //POIFSDump.main(args);

    SMOutputDocument xmldoc = SMOutputFactory.createOutputDocument(
            SMOutputFactory.getGlobalXMLOutputFactory().createXMLStreamWriter(System.out, "UTF-8"), "1.1",
            "UTF-8", true);

    xmldoc.setIndentation("\n ", 1, 2); // for unix linefeed, 2 spaces per level

    SMOutputElement xmlroot = xmldoc.addElement("properties");

    // Loop through arguments:
    for (int i = 0; i < args.length; i++) {
        SMOutputElement xd = xmlroot.addElement("document");
        xd.addAttribute("href", args[i]);

        // The document is parsed inside the constructor, so the file handle
        // can be released as soon as construction finishes (or fails).
        HWPFDocument doc;
        FileInputStream in = new FileInputStream(args[i]);
        try {
            doc = new HWPFDocument(in);
        } finally {
            in.close();
        }

        // SummaryInformation
        SMOutputElement sie = xd.addElement("SummaryInformation");
        sie.addElement("ApplicationName").addCharacters(doc.getSummaryInformation().getApplicationName());
        sie.addElement("OSVersion").addCharacters("" + doc.getSummaryInformation().getOSVersion());
        sie.addElement("Author").addCharacters("" + doc.getSummaryInformation().getAuthor());
        sie.addElement("CharCount").addCharacters("" + doc.getSummaryInformation().getCharCount());
        sie.addElement("Comments").addCharacters("" + doc.getSummaryInformation().getComments());
        sie.addElement("EditTime").addCharacters("" + doc.getSummaryInformation().getEditTime());
        sie.addElement("Format").addCharacters("" + doc.getSummaryInformation().getFormat());
        sie.addElement("Keywords").addCharacters("" + doc.getSummaryInformation().getKeywords());
        sie.addElement("LastAuthor").addCharacters("" + doc.getSummaryInformation().getLastAuthor());
        sie.addElement("PageCount").addCharacters("" + doc.getSummaryInformation().getPageCount());
        sie.addElement("RevNumber").addCharacters("" + doc.getSummaryInformation().getRevNumber());
        sie.addElement("SectionCount").addCharacters("" + doc.getSummaryInformation().getSectionCount());
        sie.addElement("Security").addCharacters("" + doc.getSummaryInformation().getSecurity());
        sie.addElement("Subject").addCharacters("" + doc.getSummaryInformation().getSubject());
        sie.addElement("Template").addCharacters("" + doc.getSummaryInformation().getTemplate());
        sie.addElement("Title").addCharacters("" + doc.getSummaryInformation().getTitle());
        sie.addElement("WordCount").addCharacters("" + doc.getSummaryInformation().getWordCount());
        sie.addElement("CreatedDateTime").addCharacters("" + doc.getSummaryInformation().getCreateDateTime());
        sie.addElement("LastPrinted").addCharacters("" + doc.getSummaryInformation().getLastPrinted());
        sie.addElement("LastSaveDateTime")
                .addCharacters("" + doc.getSummaryInformation().getLastSaveDateTime());
        sie.addElement("Thumbnail").addCharacters("" + doc.getSummaryInformation().getThumbnail());

        // TextTable
        SMOutputElement tte = xd.addElement("TextTable");
        for (TextPiece tp : doc.getTextTable().getTextPieces()) {
            SMOutputElement tpe = tte.addElement("TextPiece");
            tpe.addAttribute("isUnicode", "" + tp.getPieceDescriptor().isUnicode());
            tpe.addCharacters(tp.getStringBuilder().toString());
        }

        // DocumentSummaryInformation
        SMOutputElement dsie = xd.addElement("DocumentSummaryInformation");
        dsie.addElement("ParCount").addCharacters("" + doc.getDocumentSummaryInformation().getParCount());
        dsie.addElement("ByteCount").addCharacters("" + doc.getDocumentSummaryInformation().getByteCount());
        dsie.addElement("HiddenCount").addCharacters("" + doc.getDocumentSummaryInformation().getHiddenCount());
        dsie.addElement("LineCount").addCharacters("" + doc.getDocumentSummaryInformation().getLineCount());
        dsie.addElement("MMClipCount").addCharacters("" + doc.getDocumentSummaryInformation().getMMClipCount());
        dsie.addElement("NoteCount").addCharacters("" + doc.getDocumentSummaryInformation().getNoteCount());
        dsie.addElement("SectionCount")
                .addCharacters("" + doc.getDocumentSummaryInformation().getSectionCount());
        dsie.addElement("SlideCount").addCharacters("" + doc.getDocumentSummaryInformation().getSlideCount());
        dsie.addElement("Format").addCharacters("" + doc.getDocumentSummaryInformation().getFormat());
        dsie.addElement("PresentationFormat")
                .addCharacters("" + doc.getDocumentSummaryInformation().getPresentationFormat());
        dsie.addElement("Company").addCharacters("" + doc.getDocumentSummaryInformation().getCompany());
        dsie.addElement("Category").addCharacters("" + doc.getDocumentSummaryInformation().getCategory());

        // Sections (nested under DocumentSummaryInformation)
        for (Object os : doc.getDocumentSummaryInformation().getSections()) {
            Section s = (Section) os;
            SMOutputElement se = dsie.addElement("Section");
            se.addElement("FormatID").addCharacters("" + s.getFormatID());
            se.addElement("CodePage").addCharacters("" + s.getCodepage());
            se.addElement("PropertyCount").addCharacters("" + s.getPropertyCount());
            for (Property sp : s.getProperties()) {
                SMOutputElement pe = se.addElement("Property");
                pe.addAttribute("class", sp.getValue().getClass().getCanonicalName());
                pe.addCharacters(sp.getValue().toString());
            }
        }

        // FontTable
        SMOutputElement fte = xd.addElement("FontTable");
        for (Ffn f : doc.getFontTable().getFontNames()) {
            SMOutputElement fe = fte.addElement("Font");
            fe.addElement("MainFontName").addCharacters(f.getMainFontName());
            try {
                fe.addElement("AltFontName").addCharacters(f.getAltFontName());
            } catch (Exception e) {
                // Seems to fail, and no safe test found as yet.
            }
            fe.addElement("Size").addCharacters("" + f.getSize());
            fe.addElement("Weight").addCharacters("" + f.getWeight());
        }

        // PicturesTable
        SMOutputElement pte = xd.addElement("PicturesTable");
        for (Picture p : doc.getPicturesTable().getAllPictures()) {
            SMOutputElement pe = pte.addElement("Picture");
            pe.addElement("MimeType").addCharacters(p.getMimeType());
            pe.addElement("Width").addCharacters("" + p.getWidth());
            pe.addElement("Height").addCharacters("" + p.getHeight());
            pe.addElement("HorizontalScalingFactor").addCharacters("" + p.getHorizontalScalingFactor());
            pe.addElement("VerticalScalingFactor").addCharacters("" + p.getVerticalScalingFactor());
            pe.addElement("Content").addCharacters("" + p.getContent());
        }
        //parseCompObj( new File(args[i]) );
    }

    // Close the root only once every document has been written: closing it
    // inside the loop would end the XML output after the first file.
    xmldoc.closeRoot(); // important, flushes, closes output
}

From source file:org.wandora.utils.MSOfficeBox.java

License:Open Source License

/**
 * Grabs the text straight out of the document's text pieces. The result
 * might include various bits of crud, but this works in cases where the
 * text piece -> paragraph mapping is broken, and it is fast.
 *
 * <p>Pieces flagged as Unicode are decoded as UTF-16LE, all others as
 * Cp1252. Runs of two or three bare carriage returns are rewritten to
 * CRLF pairs, and a trailing carriage return gets a line feed appended;
 * isolated carriage returns inside the text are left untouched.
 *
 * @param doc the Word document whose text piece table is read
 * @return the concatenated text of all pieces with line endings partly normalised
 * @throws InternalError if the JVM does not support the required encoding
 */
public static String getWordTextFromPieces(HWPFDocument doc) {
    StringBuilder textBuf = new StringBuilder();

    Iterator<TextPiece> textPieces = doc.getTextTable().getTextPieces().iterator();
    while (textPieces.hasNext()) {
        TextPiece piece = textPieces.next();

        String encoding = piece.isUnicode() ? "UTF-16LE" : "Cp1252";
        try {
            textBuf.append(new String(piece.getRawBytes(), encoding));
        } catch (UnsupportedEncodingException e) {
            // Same error type and message as before, but keep the original
            // exception attached so the failure can be diagnosed.
            InternalError broken = new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
            broken.initCause(e);
            throw broken;
        }
    }

    String text = textBuf.toString();

    // Fix line endings (note: this won't get all of them). The patterns are
    // plain literals, so no regular expression is needed.
    text = text.replace("\r\r\r", "\r\n\r\n\r\n");
    text = text.replace("\r\r", "\r\n\r\n");

    if (text.endsWith("\r")) {
        text += "\n";
    }

    return text;
}

From source file:uk.bl.wa.tika.parser.ole2.OLE2Parser.java

License:Open Source License

/**
 * Prints diagnostic information about a Word document to standard output:
 * selected summary properties, a preview of each text piece, the document
 * summary sections with their properties, and the font table. Afterwards
 * the same bytes are passed to {@code parseCompObj} and the OLE2 directory
 * tree is handed to {@code dump}.
 *
 * <p>The {@code handler}, {@code metadata} and {@code context} arguments
 * are not used by this implementation.
 *
 * @param stream the raw document bytes; read to the end by this method
 * @param handler unused
 * @param metadata unused
 * @param context unused
 * @throws IOException if the stream cannot be read or is not a valid OLE2 file
 * @throws SAXException declared by the overridden method
 * @throws TikaException declared by the overridden method
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    // The document is consumed three times below (HWPFDocument, parseCompObj,
    // POIFSFileSystem). An InputStream can only be read once, so buffer the
    // bytes up front and give every consumer its own fresh stream.
    java.io.ByteArrayOutputStream buffered = new java.io.ByteArrayOutputStream();
    byte[] chunk = new byte[8192];
    int read;
    while ((read = stream.read(chunk)) != -1) {
        buffered.write(chunk, 0, read);
    }
    byte[] documentBytes = buffered.toByteArray();

    HWPFDocument doc = new HWPFDocument(new java.io.ByteArrayInputStream(documentBytes));
    System.out.println("ApplicationName: " + doc.getSummaryInformation().getApplicationName());
    System.out.println("OSVersion: " + doc.getSummaryInformation().getOSVersion());
    System.out.println("# paragraphs: " + doc.getDocumentSummaryInformation().getParCount());
    System.out.println("# bytes: " + doc.getDocumentSummaryInformation().getByteCount());
    System.out.println("# hidden: " + doc.getDocumentSummaryInformation().getHiddenCount());
    System.out.println("# lines: " + doc.getDocumentSummaryInformation().getLineCount());
    System.out.println("# mmclips: " + doc.getDocumentSummaryInformation().getMMClipCount());
    System.out.println("# notes: " + doc.getDocumentSummaryInformation().getNoteCount());
    System.out.println("# sections: " + doc.getDocumentSummaryInformation().getSectionCount());
    System.out.println("# slides: " + doc.getDocumentSummaryInformation().getSlideCount());
    System.out.println("format: " + doc.getDocumentSummaryInformation().getFormat());
    for (TextPiece tp : doc.getTextTable().getTextPieces()) {
        // Preview at most the first 100 characters; pieces can be shorter,
        // and an unclamped substring(0, 100) would throw for those.
        String pieceText = tp.getStringBuffer().toString();
        System.out.println("TP: " + pieceText.substring(0, Math.min(100, pieceText.length())));
        System.out.println("TP: " + tp.getPieceDescriptor().isUnicode());
    }
    for (Object os : doc.getDocumentSummaryInformation().getSections()) {
        Section s = (Section) os;
        System.out.println("ss# fid: " + s.getFormatID());
        System.out.println("ss# codepage: " + s.getCodepage());
        System.out.println("ss# # properties: " + s.getPropertyCount());
        for (Property sp : s.getProperties()) {
            System.out.println(
                    "ss# property: " + sp.getValue().getClass().getCanonicalName() + " " + sp.getValue());
        }
    }
    for (Ffn f : doc.getFontTable().getFontNames()) {
        System.out.println("Font: " + f.getMainFontName() + ", " + f.getSize() + ", " + f.getWeight());
    }
    parseCompObj(new java.io.ByteArrayInputStream(documentBytes));

    // Walk the raw OLE2 directory structure of the same document.
    POIFSFileSystem fs = new POIFSFileSystem(new java.io.ByteArrayInputStream(documentBytes));

    DirectoryEntry root = fs.getRoot();

    dump(root);

}