Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.microsoft; import java.io.IOException; import java.io.InputStream; import java.security.GeneralSecurityException; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Locale; import java.util.Set; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor; import org.apache.poi.poifs.crypt.Decryptor; import org.apache.poi.poifs.crypt.EncryptionInfo; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** * Defines a Microsoft document content extractor. */ public class OfficeParser extends AbstractParser { /** * Serial version UID */ private static final long serialVersionUID = 7393462244028653479L; private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<MediaType>( Arrays.asList(POIFSDocumentType.WORKBOOK.type, POIFSDocumentType.OLE10_NATIVE.type, POIFSDocumentType.WORDDOCUMENT.type, POIFSDocumentType.UNKNOWN.type, POIFSDocumentType.ENCRYPTED.type, POIFSDocumentType.POWERPOINT.type, POIFSDocumentType.PUBLISHER.type, POIFSDocumentType.PROJECT.type, POIFSDocumentType.VISIO.type, // Works isn't supported POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is POIFSDocumentType.OUTLOOK.type, POIFSDocumentType.SOLIDWORKS_PART.type, POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type, POIFSDocumentType.SOLIDWORKS_DRAWING.type))); public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } /** * Extracts properties and text from an MS Document input stream */ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); final DirectoryNode root; TikaInputStream tstream = TikaInputStream.cast(stream); if (tstream == null) { root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot(); } else { final Object container = tstream.getOpenContainer(); if (container instanceof NPOIFSFileSystem) { root = ((NPOIFSFileSystem) container).getRoot(); } else if (container instanceof DirectoryNode) { root = (DirectoryNode) container; } else { NPOIFSFileSystem fs; if (tstream.hasFile()) { fs = new NPOIFSFileSystem(tstream.getFile(), true); } else { fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)); } tstream.setOpenContainer(fs); root = fs.getRoot(); } } parse(root, context, metadata, xhtml); xhtml.endDocument(); } protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { // Parse summary entries first, to make metadata available early new SummaryExtractor(metadata).parseSummaries(root); // Parse remaining document entries POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type != POIFSDocumentType.UNKNOWN) { setType(metadata, type.getType()); } switch (type) { case SOLIDWORKS_PART: case SOLIDWORKS_ASSEMBLY: case SOLIDWORKS_DRAWING: break; case PUBLISHER: PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root); xhtml.element("p", publisherTextExtractor.getText()); break; case WORDDOCUMENT: new WordExtractor(context).parse(root, xhtml); break; case POWERPOINT: new HSLFExtractor(context).parse(root, xhtml); break; case WORKBOOK: case XLR: Locale locale = context.get(Locale.class, Locale.getDefault()); new ExcelExtractor(context, metadata).parse(root, xhtml, locale); break; case PROJECT: // We currently can't do anything beyond the metadata break; case VISIO: VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root); for (String text : visioTextExtractor.getAllText()) { xhtml.element("p", text); } break; case OUTLOOK: OutlookExtractor extractor = new OutlookExtractor(root, context); extractor.parse(xhtml, metadata); break; case ENCRYPTED: EncryptionInfo info = new EncryptionInfo(root); Decryptor d = Decryptor.getInstance(info); try { // By default, use the default Office Password String password = Decryptor.DEFAULT_PASSWORD; // If they supplied a Password Provider, ask that for the password, // and use the provider given one if available (stick with default if not) PasswordProvider passwordProvider = context.get(PasswordProvider.class); if (passwordProvider != null) { String suppliedPassword = passwordProvider.getPassword(metadata); if (suppliedPassword != null) { password = suppliedPassword; } } // Check if we've the right password or not if (!d.verifyPassword(password)) { throw new EncryptedDocumentException(); } // Decrypt the OLE2 stream, and delegate the resulting OOXML // file to the regular OOXML parser for normal handling OOXMLParser parser = new OOXMLParser(); parser.parse(d.getDataStream(root), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context); } catch (GeneralSecurityException ex) { throw new EncryptedDocumentException(ex); } default: // For unsupported / unhandled types, just the metadata // is extracted, which happened above break; } } private void setType(Metadata metadata, MediaType type) { metadata.set(Metadata.CONTENT_TYPE, type.toString()); } public enum POIFSDocumentType { WORKBOOK("xls", MediaType.application("vnd.ms-excel")), OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE), COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ), WORDDOCUMENT("doc", MediaType.application( "msword")), UNKNOWN("unknown", MediaType.application("x-tika-msoffice")), ENCRYPTED( "ole", MediaType.application("x-tika-ooxml-protected")), POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")), PUBLISHER("pub", MediaType.application("x-mspublisher")), PROJECT("mpp", MediaType.application("vnd.ms-project")), VISIO( "vsd", MediaType.application("vnd.visio")), WORKS( "wps", MediaType.application( "vnd.ms-works")), XLR( "xlr", MediaType .application( "x-tika-msworks-spreadsheet")), OUTLOOK( "msg", MediaType .application( "vnd.ms-outlook")), SOLIDWORKS_PART( "sldprt", MediaType .application( "sldworks")), SOLIDWORKS_ASSEMBLY( "sldasm", MediaType .application( "sldworks")), SOLIDWORKS_DRAWING( "slddrw", MediaType .application( "sldworks")); private final String extension; private final MediaType type; POIFSDocumentType(String extension, MediaType type) { this.extension = extension; this.type = type; } public static POIFSDocumentType detectType(POIFSFileSystem fs) { return detectType(fs.getRoot()); } public static POIFSDocumentType detectType(NPOIFSFileSystem fs) { return detectType(fs.getRoot()); } public static POIFSDocumentType detectType(DirectoryEntry node) { Set<String> names = new HashSet<String>(); for (Entry entry : node) { names.add(entry.getName()); } MediaType type = POIFSContainerDetector.detect(names, node); for (POIFSDocumentType poifsType : values()) { if (type.equals(poifsType.type)) { return poifsType; } } return UNKNOWN; } public String getExtension() { return extension; } public MediaType getType() { return type; } } }