com.ezdi.rtf.testRTFParser.RTFObjDataParser.java Source code

Java tutorial

Introduction

Here is the source code for com.ezdi.rtf.testRTFParser.RTFObjDataParser.java

Source

package com.ezdi.rtf.testRTFParser;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.FilenameUtils;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.util.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;

/**
 * Many thanks to Simon Mourier for:
 * http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf
 * and for granting permission to use his code in Tika.
 *
 * @author akash.p
 */
class RTFObjDataParser {

    // Weight of each byte position (256^0 .. 256^3), lowest byte first; each
    // byte read for a little-endian 32-bit value is multiplied by its weight.
    private final static int[] INT_LE_POWS = new int[] { 1, 256, 65536, 16777216 };

    // Charset name used when decoding length-prefixed ANSI strings.
    private final static String WIN_ASCII = "WINDOWS-1252";

    /**
     * Parses the embedded object data and returns the bytes of the embedded
     * content.
     * <p>
     * The header is read in this order: version, format id, class name, topic
     * name, item name, data size, data. The version is always added to the
     * metadata; class, topic and item names are added only when non-empty
     * after trimming.
     *
     * @param bytes
     *            actual bytes (already converted from the hex pair string
     *            stored in the embedded object data into actual bytes or read
     *            as raw binary bytes)
     * @param metadata
     *            incoming metadata; receives the values read from the header
     * @param unknownFilenameCount
     *            counter handed on to the POIFS handler, which uses it to
     *            build a name for content of unrecognised origin
     * @return byte[] for contents of obj data; null if the format id is not 2
     *         (i.e. not an embedded object) or if the package/POIFS handler
     *         returns null
     * @throws IOException
     *             if the header or data cannot be read completely
     */
    protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException {
        ByteArrayInputStream header = new ByteArrayInputStream(bytes);

        metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(readUInt(header)));

        // 2 is an embedded object. 1 is a link.
        if (readUInt(header) != 2L) {
            return null;
        }

        String className = readLengthPrefixedAnsiString(header).trim();
        String topicName = readLengthPrefixedAnsiString(header).trim();
        String itemName = readLengthPrefixedAnsiString(header).trim();

        if (!className.isEmpty()) {
            metadata.add(RTFMetadata.EMB_CLASS, className);
        }
        if (!topicName.isEmpty()) {
            metadata.add(RTFMetadata.EMB_TOPIC, topicName);
        }
        if (!itemName.isEmpty()) {
            metadata.add(RTFMetadata.EMB_ITEM, itemName);
        }

        // readBytes tests for reading too many bytes
        byte[] payload = readBytes(header, readUInt(header));

        String normalizedClass = className.toLowerCase(Locale.ROOT);
        if ("package".equals(normalizedClass)) {
            return handlePackage(payload, metadata);
        }
        if ("pbrush".equals(normalizedClass)) {
            // simple bitmap bytes
            return payload;
        }

        ByteArrayInputStream payloadStream = new ByteArrayInputStream(payload);
        if (!NPOIFSFileSystem.hasPOIFSHeader(payloadStream)) {
            return payload;
        }
        try {
            return handleEmbeddedPOIFS(payloadStream, metadata, unknownFilenameCount);
        } catch (IOException e) {
            // swallow: fall back to handing out the raw payload
            return payload;
        }
    }

    /**
     * Extracts the payload of an embedded object that is itself a POIFS
     * container.
     * <ul>
     * <li>If the root has a "Package" entry, its bytes are returned.</li>
     * <li>If the detected type is OLE10_NATIVE, the unwrapped data buffer is
     * returned (null if the record is not valid).</li>
     * <li>If the detected type is COMP_OBJ, the bytes of the "CONTENTS" (or
     * "Contents") entry are returned.</li>
     * <li>Otherwise the whole stream is returned as-is and a generated
     * resource name and the detected content type are set on the
     * metadata.</li>
     * </ul>
     *
     * @param is
     *            stream positioned at the start of the POIFS data; must
     *            support {@code reset()} back to that start, because the last
     *            case above re-reads it
     * @param metadata
     *            metadata updated with resource name and content type in the
     *            fall-through case
     * @param unknownFilenameCount
     *            counter used (and incremented) to build the generated
     *            resource name
     * @return the extracted bytes; can be null
     * @throws IOException
     *             if the stream is not actually POIFS or an entry cannot be
     *             read
     */
    private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount)
            throws IOException {

        byte[] ret = null;
        try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {

            DirectoryNode root = fs.getRoot();

            if (root == null) {
                return ret;
            }

            if (root.hasEntry("Package")) {
                Entry ooxml = root.getEntry("Package");
                ByteArrayOutputStream out = new ByteArrayOutputStream();

                // Close the entry stream once copied; it was previously left open.
                try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
                    IOUtils.copy(stream, out);
                }
                ret = out.toByteArray();
            } else {
                // try poifs
                POIFSDocumentType type = POIFSDocumentType.detectType(root);
                if (type == POIFSDocumentType.OLE10_NATIVE) {
                    try {
                        // Try to un-wrap the OLE10Native record:
                        Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                        ret = ole.getDataBuffer();
                    } catch (Ole10NativeException ex) {
                        // Not a valid OLE10Native record, skip it
                    }
                } else if (type == POIFSDocumentType.COMP_OBJ) {

                    // The entry name is looked up in both spellings.
                    DocumentEntry contentsEntry;
                    try {
                        contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
                    } catch (FileNotFoundException ioe) {
                        contentsEntry = (DocumentEntry) root.getEntry("Contents");
                    }

                    try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                        ret = new byte[contentsEntry.getSize()];
                        inp.readFully(ret);
                    }
                } else {

                    // Unrecognised container: hand back the complete stream
                    // and label it with a generated name and detected type.
                    ByteArrayOutputStream out = new ByteArrayOutputStream();
                    is.reset();
                    IOUtils.copy(is, out);
                    ret = out.toByteArray();
                    metadata.set(Metadata.RESOURCE_NAME_KEY,
                            "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                    metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                }
            }
        }
        return ret;
    }

    /**
     * Parses a "Package" payload and returns the bytes of the file embedded
     * in it. The resource name and the embedded relationship id are set on
     * the metadata from the trailing unicode file path when one is present,
     * otherwise from the display name and the ANSI file path.
     * <p>
     * can return null if there is a linked object instead of an embedded file
     *
     * @param pkgBytes
     *            raw bytes of the package
     * @param metadata
     *            metadata that receives the resource name and relationship id
     * @return the embedded file's bytes, or null if the package type is not 3
     * @throws IOException
     *             if the header is truncated, the declared data length is out
     *             of bounds, or the stream ends before all declared data bytes
     *             have been read
     */
    private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException {
        // now parse the package header
        ByteArrayInputStream is = new ByteArrayInputStream(pkgBytes);
        readUShort(is); // leading 2-byte field; value is not used

        String displayName = readAnsiString(is);

        // should we add this to the metadata?
        readAnsiString(is); // iconFilePath
        readUShort(is); // iconIndex
        int type = readUShort(is); // type

        // 1 is link, 3 is embedded object
        // this only handles embedded objects
        if (type != 3) {
            return null;
        }
        // should we really be ignoring this filePathLen?
        readUInt(is); // filePathLen

        String ansiFilePath = readAnsiString(is); // filePath
        long bytesLen = readUInt(is);
        byte[] objBytes = initByteArray(bytesLen);

        // Fill the whole buffer; a truncated payload must not be returned
        // silently padded with zero bytes.
        int filled = 0;
        while (filled < objBytes.length) {
            int n = is.read(objBytes, filled, objBytes.length - filled);
            if (n == -1) {
                throw new IOException("Hit end of stream after reading " + filled + " of " + objBytes.length
                        + " bytes of package data");
            }
            filled += n;
        }

        StringBuilder unicodeFilePath = new StringBuilder();
        try {
            long unicodeLen = readUInt(is);

            // each character is stored as two bytes, low byte first
            for (int i = 0; i < unicodeLen; i++) {
                int lo = is.read();
                int hi = is.read();
                if (hi == -1 || lo == -1) {
                    // stream ran out; empty SB and stop
                    unicodeFilePath.setLength(0);
                    break;
                }
                unicodeFilePath.append((char) (lo + 256 * hi));
            }
        } catch (IOException e) {
            // swallow; the unicode file path is optional and might not happen
            unicodeFilePath.setLength(0);
        }

        String fileNameToUse;
        String pathToUse;
        if (unicodeFilePath.length() > 0) {
            String p = unicodeFilePath.toString();
            fileNameToUse = p;
            pathToUse = p;
        } else {
            fileNameToUse = displayName == null ? "" : displayName;
            pathToUse = ansiFilePath == null ? "" : ansiFilePath;
        }
        metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse));
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse);

        return objBytes;
    }

    /**
     * Reads a little-endian unsigned 16-bit value (low byte first).
     *
     * @param is
     *            stream to read two bytes from
     * @return the value in the range 0..65535
     * @throws IOException
     *             if the stream ends before both bytes have been read
     */
    private int readUShort(InputStream is) throws IOException {
        int lo = is.read();
        int hi = is.read();
        // Test the raw return values: scaling hi before the check would turn
        // the end-of-stream marker -1 into -256 and hide it.
        if (lo == -1 || hi == -1) {
            throw new IOException("Hit end of stream before reading little endian unsigned short.");
        }
        return (hi << 8) | lo;
    }

    /**
     * Reads a little-endian unsigned 32-bit value (lowest byte first).
     *
     * @param is
     *            stream to read four bytes from
     * @return the value as a non-negative long
     * @throws IOException
     *             if the stream ends before all four bytes have been read
     */
    private long readUInt(InputStream is) throws IOException {
        long total = 0;
        // one weight per byte position, lowest position first
        for (int weight : INT_LE_POWS) {
            int b = is.read();
            if (b == -1) {
                throw new IOException("Hit end of stream before finishing little endian unsigned int.");
            }
            total += (long) weight * b;
        }
        return total;
    }

    /**
     * Reads a zero-terminated single-byte string. Every byte before the
     * terminator is appended as the character with the same numeric value;
     * the terminator is consumed but not included.
     *
     * @param is
     *            stream to read from
     * @return the characters read before the terminator (possibly empty)
     * @throws IOException
     *             if the stream ends before a terminator is found
     */
    private String readAnsiString(InputStream is) throws IOException {
        StringBuilder text = new StringBuilder();
        for (int b = is.read();; b = is.read()) {
            if (b > 0) {
                text.append((char) b);
                continue;
            }
            if (b == -1) {
                throw new IOException("Hit end of stream before end of AnsiString");
            }
            return text.toString();
        }
    }

    /**
     * Reads a string stored as a little-endian unsigned 32-bit byte count
     * followed by that many bytes, decoded as {@code WINDOWS-1252}.
     *
     * @param is
     *            stream to read from
     * @return the decoded string (not trimmed; never null)
     * @throws IOException
     *             if the length or the bytes cannot be read completely, the
     *             length is out of bounds, or the charset is unavailable
     */
    private String readLengthPrefixedAnsiString(InputStream is) throws IOException {
        long len = readUInt(is);
        byte[] bytes = readBytes(is, len);
        try {
            return new String(bytes, WIN_ASCII);
        } catch (UnsupportedEncodingException e) {
            // shouldn't ever happen; keep the cause so it can be diagnosed if it does
            throw new IOException("Unsupported encoding", e);
        }
    }

    /**
     * Reads exactly {@code len} bytes from the stream.
     *
     * @param is
     *            stream to read from
     * @param len
     *            number of bytes to read; validated by initByteArray
     * @return a new array of length {@code len} holding the bytes read
     * @throws IOException
     *             if {@code len} is out of bounds or the stream ends before
     *             {@code len} bytes have been read
     */
    private byte[] readBytes(InputStream is, long len) throws IOException {
        // initByteArray tests for "reading of too many bytes"
        byte[] bytes = initByteArray(len);

        // A single read() may return fewer bytes than requested, and a
        // zero-length read at end of stream reports -1; loop until the
        // buffer is full so neither case is mistaken for truncation.
        int filled = 0;
        while (filled < bytes.length) {
            int n = is.read(bytes, filled, bytes.length - filled);
            if (n == -1) {
                throw new IOException("Hit end of stream before reading all bytes");
            }
            filled += n;
        }

        return bytes;
    }

    /**
     * Allocates a byte array of the requested length after checking that the
     * length is non-negative and does not exceed
     * {@code RTFParser.getMaxBytesForEmbeddedObject()}.
     *
     * @param len
     *            requested array length
     * @return a new zero-filled array of {@code len} bytes
     * @throws IOException
     *             if {@code len} is negative or above the configured maximum
     */
    private byte[] initByteArray(long len) throws IOException {
        if (len >= 0 && len <= RTFParser.getMaxBytesForEmbeddedObject()) {
            return new byte[(int) len];
        }
        throw new IOException("Requested length for reading bytes is out of bounds: " + len);
    }
}