List of usage examples for org.apache.poi.hwpf.HWPFDocument#getTextTable()
@Override
@Internal
public TextPieceTable getTextTable()
From source file:org.opencrx.kernel.text.WordToText.java
License:BSD License
/** * Grab the text out of the text pieces. Might also include various * bits of crud, but will work in cases where the text piece -> paragraph * mapping is broken. Fast too.//www . ja v a 2 s . c o m */ public String getTextFromPieces(HWPFDocument doc) { StringBuffer textBuf = new StringBuffer(); Iterator<TextPiece> textPieces = doc.getTextTable().getTextPieces().iterator(); while (textPieces.hasNext()) { TextPiece piece = textPieces.next(); String encoding = "Cp1252"; if (piece.isUnicode()) { encoding = "UTF-16LE"; } try { String text = new String(piece.getRawBytes(), encoding); textBuf.append(text); } catch (UnsupportedEncodingException e) { } } String text = textBuf.toString(); // Fix line endings (Note - won't get all of them text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); text = text.replaceAll("\r\r", "\r\n\r\n"); if (text.endsWith("\r")) { text += "\n"; } return text; }
From source file:org.opf_labs.aqua.OfficeAnalyser.java
License:Apache License
public static void main(String[] args) throws Exception { //import org.apache.poi.poifs.dev.POIFSDump; //POIFSDump.main(args); SMOutputDocument xmldoc = SMOutputFactory.createOutputDocument( SMOutputFactory.getGlobalXMLOutputFactory().createXMLStreamWriter(System.out, "UTF-8"), "1.1", "UTF-8", true); xmldoc.setIndentation("\n ", 1, 2); // for unix linefeed, 2 spaces per level SMOutputElement xmlroot = xmldoc.addElement("properties"); // Loop through arguments: for (int i = 0; i < args.length; i++) { SMOutputElement xd = xmlroot.addElement("document"); xd.addAttribute("href", args[i]); HWPFDocument doc = new HWPFDocument(new FileInputStream(args[i])); // SummaryInformation SMOutputElement sie = xd.addElement("SummaryInformation"); sie.addElement("ApplicationName").addCharacters(doc.getSummaryInformation().getApplicationName()); sie.addElement("OSVersion").addCharacters("" + doc.getSummaryInformation().getOSVersion()); sie.addElement("Author").addCharacters("" + doc.getSummaryInformation().getAuthor()); sie.addElement("CharCount").addCharacters("" + doc.getSummaryInformation().getCharCount()); sie.addElement("Comments").addCharacters("" + doc.getSummaryInformation().getComments()); sie.addElement("EditTime").addCharacters("" + doc.getSummaryInformation().getEditTime()); sie.addElement("Format").addCharacters("" + doc.getSummaryInformation().getFormat()); sie.addElement("Keywords").addCharacters("" + doc.getSummaryInformation().getKeywords()); sie.addElement("LastAuthor").addCharacters("" + doc.getSummaryInformation().getLastAuthor()); sie.addElement("PageCount").addCharacters("" + doc.getSummaryInformation().getPageCount()); sie.addElement("RevNumber").addCharacters("" + doc.getSummaryInformation().getRevNumber()); sie.addElement("SectionCount").addCharacters("" + doc.getSummaryInformation().getSectionCount()); sie.addElement("Security").addCharacters("" + doc.getSummaryInformation().getSecurity()); sie.addElement("Subject").addCharacters("" + 
doc.getSummaryInformation().getSubject()); sie.addElement("Template").addCharacters("" + doc.getSummaryInformation().getTemplate()); sie.addElement("Title").addCharacters("" + doc.getSummaryInformation().getTitle()); sie.addElement("WordCount").addCharacters("" + doc.getSummaryInformation().getWordCount()); sie.addElement("CreatedDateTime").addCharacters("" + doc.getSummaryInformation().getCreateDateTime()); sie.addElement("LastPrinted").addCharacters("" + doc.getSummaryInformation().getLastPrinted()); sie.addElement("LastSaveDateTime") .addCharacters("" + doc.getSummaryInformation().getLastSaveDateTime()); sie.addElement("Thumbnail").addCharacters("" + doc.getSummaryInformation().getThumbnail()); // TextTable SMOutputElement tte = xd.addElement("TextTable"); for (TextPiece tp : doc.getTextTable().getTextPieces()) { SMOutputElement tpe = tte.addElement("TextPiece"); tpe.addAttribute("isUnicode", "" + tp.getPieceDescriptor().isUnicode()); tpe.addCharacters(tp.getStringBuilder().toString()); }// w ww. j a va 2s . 
co m // DocumentSummaryInformation SMOutputElement dsie = xd.addElement("DocumentSummaryInformation"); dsie.addElement("ParCount").addCharacters("" + doc.getDocumentSummaryInformation().getParCount()); dsie.addElement("ByteCount").addCharacters("" + doc.getDocumentSummaryInformation().getByteCount()); dsie.addElement("HiddenCount").addCharacters("" + doc.getDocumentSummaryInformation().getHiddenCount()); dsie.addElement("LineCount").addCharacters("" + doc.getDocumentSummaryInformation().getLineCount()); dsie.addElement("MMClipCount").addCharacters("" + doc.getDocumentSummaryInformation().getMMClipCount()); dsie.addElement("NoteCount").addCharacters("" + doc.getDocumentSummaryInformation().getNoteCount()); dsie.addElement("SectionCount") .addCharacters("" + doc.getDocumentSummaryInformation().getSectionCount()); dsie.addElement("SlideCount").addCharacters("" + doc.getDocumentSummaryInformation().getSlideCount()); dsie.addElement("Format").addCharacters("" + doc.getDocumentSummaryInformation().getFormat()); dsie.addElement("PresentationFormat") .addCharacters("" + doc.getDocumentSummaryInformation().getPresentationFormat()); dsie.addElement("Company").addCharacters("" + doc.getDocumentSummaryInformation().getCompany()); dsie.addElement("Category").addCharacters("" + doc.getDocumentSummaryInformation().getCategory()); // Sections for (Object os : doc.getDocumentSummaryInformation().getSections()) { Section s = (Section) os; SMOutputElement se = dsie.addElement("Section"); se.addElement("FormatID").addCharacters("" + s.getFormatID()); se.addElement("CodePage").addCharacters("" + s.getCodepage()); se.addElement("PropertyCount").addCharacters("" + s.getPropertyCount()); for (Property sp : s.getProperties()) { SMOutputElement pe = se.addElement("Property"); pe.addAttribute("class", sp.getValue().getClass().getCanonicalName()); pe.addCharacters(sp.getValue().toString()); } } SMOutputElement fte = xd.addElement("FontTable"); for (Ffn f : doc.getFontTable().getFontNames()) { 
SMOutputElement fe = fte.addElement("Font"); fe.addElement("MainFontName").addCharacters(f.getMainFontName()); try { fe.addElement("AltFontName").addCharacters(f.getAltFontName()); } catch (Exception e) { // Seems to fail, and no safe test found as yet. } fe.addElement("Size").addCharacters("" + f.getSize()); fe.addElement("Weight").addCharacters("" + f.getWeight()); } SMOutputElement pte = xd.addElement("PicturesTable"); for (Picture p : doc.getPicturesTable().getAllPictures()) { SMOutputElement pe = pte.addElement("Picture"); pe.addElement("MimeType").addCharacters(p.getMimeType()); pe.addElement("Width").addCharacters("" + p.getWidth()); pe.addElement("Height").addCharacters("" + p.getHeight()); pe.addElement("HorizontalScalingFactor").addCharacters("" + p.getHorizontalScalingFactor()); pe.addElement("VerticalScalingFactor").addCharacters("" + p.getVerticalScalingFactor()); pe.addElement("Content").addCharacters("" + p.getContent()); } //parseCompObj( new File(args[i]) ); // This //System.out.println("Dumping " + args[i]); FileInputStream is = new FileInputStream(args[i]); POIFSFileSystem fs = new POIFSFileSystem(is); is.close(); DirectoryEntry root = fs.getRoot(); //dump(root); xmldoc.closeRoot(); // important, flushes, closes output } }
From source file:org.wandora.utils.MSOfficeBox.java
License:Open Source License
/** * Grab the text out of the text pieces. Might also include various * bits of crud, but will work in cases where the text piece -> paragraph * mapping is broken. Fast too.//from w ww. ja v a 2 s . c om */ public static String getWordTextFromPieces(HWPFDocument doc) { StringBuilder textBuf = new StringBuilder(); Iterator textPieces = doc.getTextTable().getTextPieces().iterator(); while (textPieces.hasNext()) { TextPiece piece = (TextPiece) textPieces.next(); String encoding = "Cp1252"; if (piece.isUnicode()) { encoding = "UTF-16LE"; } try { String text = new String(piece.getRawBytes(), encoding); textBuf.append(text); } catch (UnsupportedEncodingException e) { throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken"); } } String text = textBuf.toString(); // Fix line endings (Note - won't get all of them text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); text = text.replaceAll("\r\r", "\r\n\r\n"); if (text.endsWith("\r")) { text += "\n"; } return text; }
From source file:uk.bl.wa.tika.parser.ole2.OLE2Parser.java
License:Open Source License
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { HWPFDocument doc = new HWPFDocument(stream); System.out.println("ApplicationName: " + doc.getSummaryInformation().getApplicationName()); System.out.println("OSVersion: " + doc.getSummaryInformation().getOSVersion()); System.out.println("# paragraphs: " + doc.getDocumentSummaryInformation().getParCount()); System.out.println("# bytes: " + doc.getDocumentSummaryInformation().getByteCount()); System.out.println("# hidden: " + doc.getDocumentSummaryInformation().getHiddenCount()); System.out.println("# lines: " + doc.getDocumentSummaryInformation().getLineCount()); System.out.println("# mmclips: " + doc.getDocumentSummaryInformation().getMMClipCount()); System.out.println("# notes: " + doc.getDocumentSummaryInformation().getNoteCount()); System.out.println("# sections: " + doc.getDocumentSummaryInformation().getSectionCount()); System.out.println("# slides: " + doc.getDocumentSummaryInformation().getSlideCount()); System.out.println("format: " + doc.getDocumentSummaryInformation().getFormat()); for (TextPiece tp : doc.getTextTable().getTextPieces()) { System.out.println("TP: " + tp.getStringBuffer().substring(0, 100)); System.out.println("TP: " + tp.getPieceDescriptor().isUnicode()); }/*from www. 
ja va 2 s .c o m*/ for (Object os : doc.getDocumentSummaryInformation().getSections()) { Section s = (Section) os; System.out.println("ss# fid: " + s.getFormatID()); System.out.println("ss# codepage: " + s.getCodepage()); System.out.println("ss# # properties: " + s.getPropertyCount()); for (Property sp : s.getProperties()) { System.out.println( "ss# property: " + sp.getValue().getClass().getCanonicalName() + " " + sp.getValue()); } } for (Ffn f : doc.getFontTable().getFontNames()) { System.out.println("Font: " + f.getMainFontName() + ", " + f.getSize() + ", " + f.getWeight()); } parseCompObj(stream); // This POIFSFileSystem fs = new POIFSFileSystem(stream); DirectoryEntry root = fs.getRoot(); dump(root); }