org.apache.tika.parser.rtf.RTFObjDataParser.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.parser.rtf.RTFObjDataParser.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.rtf;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.FilenameUtils;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;

/**
 * Many thanks to Simon Mourier for:
 * http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf
 * and for granting permission to use his code in Tika.
 */
class RTFObjDataParser {

    private final static String WIN_ASCII = "WINDOWS-1252";
    private final int memoryLimitInKb;

    RTFObjDataParser(int memoryLimitInKb) {
        this.memoryLimitInKb = memoryLimitInKb;
    }

    /**
     * Parses the embedded object/pict string
     *
     * @param bytes actual bytes (already converted from the 
     *  hex pair string stored in the embedded object data into actual bytes or read
     *  as raw binary bytes)
     * @return a SimpleRTFEmbObj or null
     * @throws IOException if there are any surprise surprises during parsing
     */

    /**
     * @param bytes
     * @param metadata             incoming metadata
     * @param unknownFilenameCount
     * @return byte[] for contents of obj data
     * @throws IOException
     */
    protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount)
            throws IOException, TikaException {
        ByteArrayInputStream is = new ByteArrayInputStream(bytes);
        long version = readUInt(is);
        metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version));

        long formatId = readUInt(is);
        //2 is an embedded object. 1 is a link.
        if (formatId != 2L) {
            return null;
        }
        String className = readLengthPrefixedAnsiString(is).trim();
        String topicName = readLengthPrefixedAnsiString(is).trim();
        String itemName = readLengthPrefixedAnsiString(is).trim();

        if (className != null && className.length() > 0) {
            metadata.add(RTFMetadata.EMB_CLASS, className);
        }
        if (topicName != null && topicName.length() > 0) {
            metadata.add(RTFMetadata.EMB_TOPIC, topicName);
        }
        if (itemName != null && itemName.length() > 0) {
            metadata.add(RTFMetadata.EMB_ITEM, itemName);
        }

        long dataSz = readUInt(is);

        //readBytes tests for reading too many bytes
        byte[] embObjBytes = readBytes(is, dataSz);

        if (className.toLowerCase(Locale.ROOT).equals("package")) {
            return handlePackage(embObjBytes, metadata);
        } else if (className.toLowerCase(Locale.ROOT).equals("pbrush")) {
            //simple bitmap bytes
            return embObjBytes;
        } else {
            ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes);
            boolean hasPoifs = false;
            try {
                hasPoifs = NPOIFSFileSystem.hasPOIFSHeader(embIs);
            } catch (IOException e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                return embObjBytes;
            }
            if (hasPoifs) {
                try {
                    return handleEmbeddedPOIFS(embIs, metadata, unknownFilenameCount);
                } catch (Exception e) {
                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                }
            }
        }
        return embObjBytes;
    }

    //will throw IOException if not actually POIFS
    //can return null byte[]
    private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount)
            throws IOException {

        byte[] ret = null;
        try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {

            DirectoryNode root = fs.getRoot();

            if (root == null) {
                return ret;
            }

            if (root.hasEntry("Package")) {
                Entry ooxml = root.getEntry("Package");
                TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml));

                ByteArrayOutputStream out = new ByteArrayOutputStream();

                IOUtils.copy(stream, out);
                ret = out.toByteArray();
            } else {
                //try poifs
                POIFSDocumentType type = POIFSDocumentType.detectType(root);
                if (type == POIFSDocumentType.OLE10_NATIVE) {
                    try {
                        // Try to un-wrap the OLE10Native record:
                        Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                        ret = ole.getDataBuffer();
                    } catch (Ole10NativeException ex) {
                        // Not a valid OLE10Native record, skip it
                    }
                } else if (type == POIFSDocumentType.COMP_OBJ) {

                    DocumentEntry contentsEntry;
                    try {
                        contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
                    } catch (FileNotFoundException ioe) {
                        contentsEntry = (DocumentEntry) root.getEntry("Contents");
                    }

                    try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                        ret = new byte[contentsEntry.getSize()];
                        inp.readFully(ret);
                    }
                } else {

                    ByteArrayOutputStream out = new ByteArrayOutputStream();
                    is.reset();
                    IOUtils.copy(is, out);
                    ret = out.toByteArray();
                    metadata.set(Metadata.RESOURCE_NAME_KEY,
                            "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                    metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                }
            }
        }
        return ret;
    }

    /**
     * can return null if there is a linked object
     * instead of an embedded file
     */
    private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException, TikaException {
        //now parse the package header
        ByteArrayInputStream is = new ByteArrayInputStream(pkgBytes);
        readUShort(is);

        String displayName = readAnsiString(is);

        //should we add this to the metadata?
        readAnsiString(is); //iconFilePath
        try {
            //iconIndex
            EndianUtils.readUShortBE(is);
        } catch (EndianUtils.BufferUnderrunException e) {
            throw new IOException(e);
        }
        int type = readUShort(is); //type

        //1 is link, 3 is embedded object
        //this only handles embedded objects
        if (type != 3) {
            return null;
        }
        //should we really be ignoring this filePathLen?
        readUInt(is); //filePathLen

        String ansiFilePath = readAnsiString(is); //filePath
        long bytesLen = readUInt(is);
        byte[] objBytes = initByteArray(bytesLen);
        IOUtils.readFully(is, objBytes);
        StringBuilder unicodeFilePath = new StringBuilder();

        try {
            long unicodeLen = readUInt(is);

            for (int i = 0; i < unicodeLen; i++) {
                int lo = is.read();
                int hi = is.read();
                int sum = lo + 256 * hi;
                if (hi == -1 || lo == -1) {
                    //stream ran out; empty SB and stop
                    unicodeFilePath.setLength(0);
                    break;
                }
                unicodeFilePath.append((char) sum);
            }
        } catch (IOException e) {
            //swallow; the unicode file path is optional and might not happen
            unicodeFilePath.setLength(0);
        }
        String fileNameToUse = "";
        String pathToUse = "";
        if (unicodeFilePath.length() > 0) {
            String p = unicodeFilePath.toString();
            fileNameToUse = p;
            pathToUse = p;
        } else {
            fileNameToUse = displayName == null ? "" : displayName;
            pathToUse = ansiFilePath == null ? "" : ansiFilePath;
        }
        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileNameToUse);
        metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse));
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse);

        return objBytes;
    }

    private int readUShort(InputStream is) throws IOException {
        try {
            return EndianUtils.readUShortLE(is);
        } catch (EndianUtils.BufferUnderrunException e) {
            throw new IOException(e);
        }
    }

    private long readUInt(InputStream is) throws IOException {
        try {
            return EndianUtils.readUIntLE(is);
        } catch (EndianUtils.BufferUnderrunException e) {
            throw new IOException(e);
        }
    }

    private String readAnsiString(InputStream is) throws IOException {
        StringBuilder sb = new StringBuilder();
        int c = is.read();
        while (c > 0) {
            sb.append((char) c);
            c = is.read();
        }
        if (c == -1) {
            throw new IOException("Hit end of stream before end of AnsiString");
        }
        return sb.toString();
    }

    private String readLengthPrefixedAnsiString(InputStream is) throws IOException, TikaException {
        long len = readUInt(is);
        byte[] bytes = readBytes(is, len);
        try {
            return new String(bytes, WIN_ASCII);
        } catch (UnsupportedEncodingException e) {
            //shouldn't ever happen
            throw new IOException("Unsupported encoding");
        }
    }

    private byte[] readBytes(InputStream is, long len) throws IOException, TikaException {
        //initByteArray tests for "reading of too many bytes"
        byte[] bytes = initByteArray(len);
        IOUtils.readFully(is, bytes);
        return bytes;
    }

    private byte[] initByteArray(long len) throws IOException, TikaException {
        if (len < 0) {
            throw new IOException("Requested length for reading bytes < 0?!: " + len);
        } else if (memoryLimitInKb > -1 && len > memoryLimitInKb * 1024) {
            throw new TikaMemoryLimitException("File embedded in RTF caused this (" + len
                    + ") bytes), but maximum allowed is (" + (memoryLimitInKb * 1024) + ")."
                    + "If this is a valid RTF file, consider increasing the memory limit via TikaConfig.");
        } else if (len > Integer.MAX_VALUE) {
            throw new TikaMemoryLimitException("File embedded in RTF caused this (" + len
                    + ") bytes), but there is a hard limit of Integer.MAX_VALUE+");
        }

        return new byte[(int) len];

    }
}