com.blm.orc.ReaderImpl.java Source code

Introduction

Here is the source code for com.blm.orc.ReaderImpl.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.blm.orc;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.UserMetadataItem;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.orc.OrcProto;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.Text;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.protobuf.CodedInputStream;

final class ReaderImpl implements Reader {

    private static final Log LOG = LogFactory.getLog(ReaderImpl.class);

    private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;

    private final FileSystem fileSystem;
    private final Path path;
    private final CompressionKind compressionKind;
    private final CompressionCodec codec;
    private final int bufferSize;
    private OrcProto.Metadata metadata = null;
    private final int metadataSize;
    private final OrcProto.Footer footer;
    private final ObjectInspector inspector;
    private long deserializedSize = -1;
    private final Configuration conf;
    private final List<Integer> versionList;
    private final OrcFile.WriterVersion writerVersion;

    //serialized footer - Keeping this around for use by getFileMetaInfo()
    // will help avoid cpu cycles spend in deserializing at cost of increased
    // memory footprint.
    private final ByteBuffer footerByteBuffer;

    static class StripeInformationImpl implements StripeInformation {
        private final OrcProto.StripeInformation stripe;

        StripeInformationImpl(OrcProto.StripeInformation stripe) {
            this.stripe = stripe;
        }

        @Override
        public long getOffset() {
            return stripe.getOffset();
        }

        @Override
        public long getLength() {
            return stripe.getDataLength() + getIndexLength() + getFooterLength();
        }

        @Override
        public long getDataLength() {
            return stripe.getDataLength();
        }

        @Override
        public long getFooterLength() {
            return stripe.getFooterLength();
        }

        @Override
        public long getIndexLength() {
            return stripe.getIndexLength();
        }

        @Override
        public long getNumberOfRows() {
            return stripe.getNumberOfRows();
        }

        @Override
        public String toString() {
            return "offset: " + getOffset() + " data: " + getDataLength() + " rows: " + getNumberOfRows()
                    + " tail: " + getFooterLength() + " index: " + getIndexLength();
        }
    }

    @Override
    public long getNumberOfRows() {
        return footer.getNumberOfRows();
    }

    @Override
    public List<String> getMetadataKeys() {
        List<String> result = new ArrayList<String>();
        for (OrcProto.UserMetadataItem item : footer.getMetadataList()) {
            result.add(item.getName());
        }
        return result;
    }

    @Override
    public ByteBuffer getMetadataValue(String key) {
        for (OrcProto.UserMetadataItem item : footer.getMetadataList()) {
            if (item.hasName() && item.getName().equals(key)) {
                return item.getValue().asReadOnlyByteBuffer();
            }
        }
        throw new IllegalArgumentException("Can't find user metadata " + key);
    }

    public boolean hasMetadataValue(String key) {
        for (OrcProto.UserMetadataItem item : footer.getMetadataList()) {
            if (item.hasName() && item.getName().equals(key)) {
                return true;
            }
        }
        return false;
    }

    @Override
    public CompressionKind getCompression() {
        return compressionKind;
    }

    @Override
    public int getCompressionSize() {
        return bufferSize;
    }

    @Override
    public List<StripeInformation> getStripes() {
        List<StripeInformation> result = new ArrayList<StripeInformation>();
        for (OrcProto.StripeInformation info : footer.getStripesList()) {
            result.add(new StripeInformationImpl(info));
        }
        return result;
    }

    @Override
    public ObjectInspector getObjectInspector() {
        return inspector;
    }

    @Override
    public long getContentLength() {
        return footer.getContentLength();
    }

    @Override
    public List<OrcProto.Type> getTypes() {
        return footer.getTypesList();
    }

    @Override
    public OrcFile.Version getFileVersion() {
        for (OrcFile.Version version : OrcFile.Version.values()) {
            if (version.getMajor() == versionList.get(0) && version.getMinor() == versionList.get(1)) {
                return version;
            }
        }
        return OrcFile.Version.V_0_11;
    }

    @Override
    public OrcFile.WriterVersion getWriterVersion() {
        return writerVersion;
    }

    @Override
    public int getRowIndexStride() {
        return footer.getRowIndexStride();
    }

    @Override
    public ColumnStatistics[] getStatistics() {
        ColumnStatistics[] result = new ColumnStatistics[footer.getTypesCount()];
        for (int i = 0; i < result.length; ++i) {
            result[i] = ColumnStatisticsImpl.deserialize(footer.getStatistics(i));
        }
        return result;
    }

    /**
     * Ensure this is an ORC file to prevent users from trying to read text
     * files or RC files as ORC files.
     * @param in the file being read
     * @param path the filename for error messages
     * @param psLen the postscript length
     * @param buffer the tail of the file
     * @throws IOException
     */
    static void ensureOrcFooter(FSDataInputStream in, Path path, int psLen, ByteBuffer buffer) throws IOException {
        int len = OrcFile.MAGIC.length();
        if (psLen < len + 1) {
            throw new IOException("Malformed ORC file " + path + ". Invalid postscript length " + psLen);
        }
        int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - 1 - len;
        byte[] array = buffer.array();
        // now look for the magic string at the end of the postscript.
        if (!Text.decode(array, offset, len).equals(OrcFile.MAGIC)) {
            // If it isn't there, this may be the 0.11.0 version of ORC.
            // Read the first 3 bytes of the file to check for the header
            in.seek(0);
            byte[] header = new byte[len];
            in.readFully(header, 0, len);
            // if it isn't there, this isn't an ORC file
            if (!Text.decode(header, 0, len).equals(OrcFile.MAGIC)) {
                throw new IOException("Malformed ORC file " + path + ". Invalid postscript.");
            }
        }
    }

    /**
     * Build a version string out of an array.
     * @param version the version number as a list
     * @return the human readable form of the version string
     */
    private static String versionString(List<Integer> version) {
        StringBuilder buffer = new StringBuilder();
        for (int i = 0; i < version.size(); ++i) {
            if (i != 0) {
                buffer.append('.');
            }
            buffer.append(version.get(i));
        }
        return buffer.toString();
    }

    /**
     * Check to see if this ORC file is from a future version and if so,
     * warn the user that we may not be able to read all of the column encodings.
     * @param log the logger to write any error message to
     * @param path the filename for error messages
     * @param version the version of hive that wrote the file.
     */
    static void checkOrcVersion(Log log, Path path, List<Integer> version) {
        if (version.size() >= 1) {
            int major = version.get(0);
            int minor = 0;
            if (version.size() >= 2) {
                minor = version.get(1);
            }
            if (major > OrcFile.Version.CURRENT.getMajor() || (major == OrcFile.Version.CURRENT.getMajor()
                    && minor > OrcFile.Version.CURRENT.getMinor())) {
                log.warn("ORC file " + path + " was written by a future Hive version " + versionString(version)
                        + ". This file may not be readable by this version of Hive.");
            }
        }
    }

    /**
    * Constructor that let's the user specify additional options.
     * @param path pathname for file
     * @param options options for reading
     * @throws IOException
     */
    ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException {
        FileSystem fs = options.getFilesystem();
        if (fs == null) {
            fs = path.getFileSystem(options.getConfiguration());
        }
        this.fileSystem = fs;
        this.path = path;
        this.conf = options.getConfiguration();

        FileMetaInfo footerMetaData;
        if (options.getFileMetaInfo() != null) {
            footerMetaData = options.getFileMetaInfo();
        } else {
            footerMetaData = extractMetaInfoFromFooter(fs, path, options.getMaxLength());
        }
        MetaInfoObjExtractor rInfo = new MetaInfoObjExtractor(footerMetaData.compressionType,
                footerMetaData.bufferSize, footerMetaData.metadataSize, footerMetaData.footerBuffer);
        this.footerByteBuffer = footerMetaData.footerBuffer;
        this.compressionKind = rInfo.compressionKind;
        this.codec = rInfo.codec;
        this.bufferSize = rInfo.bufferSize;
        this.metadataSize = rInfo.metadataSize;
        this.metadata = rInfo.metadata;
        this.footer = rInfo.footer;
        this.inspector = rInfo.inspector;
        this.versionList = footerMetaData.versionList;
        this.writerVersion = footerMetaData.writerVersion;
    }

    /**
     * Get the WriterVersion based on the ORC file postscript.
     * @param writerVersion the integer writer version
     * @return
     */
    static OrcFile.WriterVersion getWriterVersion(int writerVersion) {
        for (OrcFile.WriterVersion version : OrcFile.WriterVersion.values()) {
            if (version.getId() == writerVersion) {
                return version;
            }
        }
        return OrcFile.WriterVersion.ORIGINAL;
    }

    private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, Path path, long maxFileLength)
            throws IOException {
        FSDataInputStream file = fs.open(path);

        // figure out the size of the file using the option or filesystem
        long size;
        if (maxFileLength == Long.MAX_VALUE) {
            size = fs.getFileStatus(path).getLen();
        } else {
            size = maxFileLength;
        }

        //read last bytes into buffer to get PostScript
        int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
        file.seek(size - readSize);
        ByteBuffer buffer = ByteBuffer.allocate(readSize);
        file.readFully(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());

        //read the PostScript
        //get length of PostScript
        int psLen = buffer.get(readSize - 1) & 0xff;
        ensureOrcFooter(file, path, psLen, buffer);
        int psOffset = readSize - 1 - psLen;
        CodedInputStream in = CodedInputStream.newInstance(buffer.array(), buffer.arrayOffset() + psOffset, psLen);
        OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in);

        checkOrcVersion(LOG, path, ps.getVersionList());

        int footerSize = (int) ps.getFooterLength();
        int metadataSize = (int) ps.getMetadataLength();
        OrcFile.WriterVersion writerVersion;
        if (ps.hasWriterVersion()) {
            writerVersion = getWriterVersion(ps.getWriterVersion());
        } else {
            writerVersion = OrcFile.WriterVersion.ORIGINAL;
        }

        //check compression codec
        switch (ps.getCompression()) {
        case NONE:
            break;
        case ZLIB:
            break;
        case SNAPPY:
            break;
        case LZO:
            break;
        default:
            throw new IllegalArgumentException("Unknown compression");
        }

        //check if extra bytes need to be read
        int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
        if (extra > 0) {
            //more bytes need to be read, seek back to the right place and read extra bytes
            file.seek(size - readSize - extra);
            ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
            file.readFully(extraBuf.array(), extraBuf.arrayOffset() + extraBuf.position(), extra);
            extraBuf.position(extra);
            //append with already read bytes
            extraBuf.put(buffer);
            buffer = extraBuf;
            buffer.position(0);
            buffer.limit(footerSize + metadataSize);
        } else {
            //footer is already in the bytes in buffer, just adjust position, length
            buffer.position(psOffset - footerSize - metadataSize);
            buffer.limit(psOffset);
        }

        // remember position for later
        buffer.mark();

        file.close();

        return new FileMetaInfo(ps.getCompression().toString(), (int) ps.getCompressionBlockSize(),
                (int) ps.getMetadataLength(), buffer, ps.getVersionList(), writerVersion);
    }

    /**
     * MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl
     *  from serialized fields.
     * As the fields are final, the fields need to be initialized in the constructor and
     *  can't be done in some helper function. So this helper class is used instead.
     *
     */
    private static class MetaInfoObjExtractor {
        final CompressionKind compressionKind;
        final CompressionCodec codec;
        final int bufferSize;
        final int metadataSize;
        final OrcProto.Metadata metadata;
        final OrcProto.Footer footer;
        final ObjectInspector inspector;

        MetaInfoObjExtractor(String codecStr, int bufferSize, int metadataSize, ByteBuffer footerBuffer)
                throws IOException {

            this.compressionKind = CompressionKind.valueOf(codecStr);
            this.bufferSize = bufferSize;
            this.codec = WriterImpl.createCodec(compressionKind);
            this.metadataSize = metadataSize;

            int position = footerBuffer.position();
            int footerBufferSize = footerBuffer.limit() - footerBuffer.position() - metadataSize;
            footerBuffer.limit(position + metadataSize);

            InputStream instream = InStream.create("metadata", new ByteBuffer[] { footerBuffer }, new long[] { 0L },
                    metadataSize, codec, bufferSize);
            this.metadata = OrcProto.Metadata.parseFrom(instream);

            footerBuffer.position(position + metadataSize);
            footerBuffer.limit(position + metadataSize + footerBufferSize);
            instream = InStream.create("footer", new ByteBuffer[] { footerBuffer }, new long[] { 0L },
                    footerBufferSize, codec, bufferSize);
            this.footer = OrcProto.Footer.parseFrom(instream);

            footerBuffer.position(position);
            this.inspector = OrcStruct.createObjectInspector(0, footer.getTypesList());
        }
    }

    /**
     * FileMetaInfo - represents file metadata stored in footer and postscript sections of the file
     * that is useful for Reader implementation
     *
     */
    static class FileMetaInfo {
        final String compressionType;
        final int bufferSize;
        final int metadataSize;
        final ByteBuffer footerBuffer;
        final List<Integer> versionList;
        final OrcFile.WriterVersion writerVersion;

        FileMetaInfo(String compressionType, int bufferSize, int metadataSize, ByteBuffer footerBuffer,
                OrcFile.WriterVersion writerVersion) {
            this(compressionType, bufferSize, metadataSize, footerBuffer, null, writerVersion);
        }

        FileMetaInfo(String compressionType, int bufferSize, int metadataSize, ByteBuffer footerBuffer,
                List<Integer> versionList, OrcFile.WriterVersion writerVersion) {
            this.compressionType = compressionType;
            this.bufferSize = bufferSize;
            this.metadataSize = metadataSize;
            this.footerBuffer = footerBuffer;
            this.versionList = versionList;
            this.writerVersion = writerVersion;
        }
    }

    public FileMetaInfo getFileMetaInfo() {
        return new FileMetaInfo(compressionKind.toString(), bufferSize, metadataSize, footerByteBuffer, versionList,
                writerVersion);
    }

    @Override
    public RecordReader rows() throws IOException {
        return rowsOptions(new Options());
    }

    @Override
    public RecordReader rowsOptions(Options options) throws IOException {
        LOG.info("Reading ORC rows from " + path + " with " + options);
        boolean[] include = options.getInclude();
        // if included columns is null, then include all columns
        if (include == null) {
            include = new boolean[footer.getTypesCount()];
            Arrays.fill(include, true);
            options.include(include);
        }
        return new RecordReaderImpl(this.getStripes(), fileSystem, path, options, footer.getTypesList(), codec,
                bufferSize, footer.getRowIndexStride(), conf);
    }

    @Override
    public RecordReader rows(boolean[] include) throws IOException {
        return rowsOptions(new Options().include(include));
    }

    @Override
    public RecordReader rows(long offset, long length, boolean[] include) throws IOException {
        return rowsOptions(new Options().include(include).range(offset, length));
    }

    @Override
    public RecordReader rows(long offset, long length, boolean[] include, SearchArgument sarg, String[] columnNames)
            throws IOException {
        return rowsOptions(new Options().include(include).range(offset, length).searchArgument(sarg, columnNames));
    }

    @Override
    public long getRawDataSize() {
        // if the deserializedSize is not computed, then compute it, else
        // return the already computed size. since we are reading from the footer
        // we don't have to compute deserialized size repeatedly
        if (deserializedSize == -1) {
            List<OrcProto.ColumnStatistics> stats = footer.getStatisticsList();
            List<Integer> indices = Lists.newArrayList();
            for (int i = 0; i < stats.size(); ++i) {
                indices.add(i);
            }
            deserializedSize = getRawDataSizeFromColIndices(indices);
        }
        return deserializedSize;
    }

    private long getRawDataSizeFromColIndices(List<Integer> colIndices) {
        long result = 0;
        for (int colIdx : colIndices) {
            result += getRawDataSizeOfColumn(colIdx);
        }
        return result;
    }

    private long getRawDataSizeOfColumn(int colIdx) {
        OrcProto.ColumnStatistics colStat = footer.getStatistics(colIdx);
        long numVals = colStat.getNumberOfValues();
        Type type = footer.getTypes(colIdx);

        switch (type.getKind()) {
        case BINARY:
            // old orc format doesn't support binary statistics. checking for binary
            // statistics is not required as protocol buffers takes care of it.
            return colStat.getBinaryStatistics().getSum();
        case STRING:
        case CHAR:
        case VARCHAR:
            // old orc format doesn't support sum for string statistics. checking for
            // existence is not required as protocol buffers takes care of it.

            // ORC strings are deserialized to java strings. so use java data model's
            // string size
            numVals = numVals == 0 ? 1 : numVals;
            int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals);
            return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen);
        case TIMESTAMP:
            return numVals * JavaDataModel.get().lengthOfTimestamp();
        case DATE:
            return numVals * JavaDataModel.get().lengthOfDate();
        case DECIMAL:
            return numVals * JavaDataModel.get().lengthOfDecimal();
        case DOUBLE:
        case LONG:
            return numVals * JavaDataModel.get().primitive2();
        case FLOAT:
        case INT:
        case SHORT:
        case BOOLEAN:
        case BYTE:
            return numVals * JavaDataModel.get().primitive1();
        default:
            LOG.debug("Unknown primitive category.");
            break;
        }

        return 0;
    }

    @Override
    public long getRawDataSizeOfColumns(List<String> colNames) {
        List<Integer> colIndices = getColumnIndicesFromNames(colNames);
        return getRawDataSizeFromColIndices(colIndices);
    }

    private List<Integer> getColumnIndicesFromNames(List<String> colNames) {
        // top level struct
        Type type = footer.getTypesList().get(0);
        List<Integer> colIndices = Lists.newArrayList();
        List<String> fieldNames = type.getFieldNamesList();
        int fieldIdx = 0;
        for (String colName : colNames) {
            if (fieldNames.contains(colName)) {
                fieldIdx = fieldNames.indexOf(colName);
            }

            // a single field may span multiple columns. find start and end column
            // index for the requested field
            int idxStart = type.getSubtypes(fieldIdx);

            int idxEnd;

            // if the specified is the last field and then end index will be last
            // column index
            if (fieldIdx + 1 > fieldNames.size() - 1) {
                idxEnd = getLastIdx() + 1;
            } else {
                idxEnd = type.getSubtypes(fieldIdx + 1);
            }

            // if start index and end index are same then the field is a primitive
            // field else complex field (like map, list, struct, union)
            if (idxStart == idxEnd) {
                // simple field
                colIndices.add(idxStart);
            } else {
                // complex fields spans multiple columns
                for (int i = idxStart; i < idxEnd; i++) {
                    colIndices.add(i);
                }
            }
        }
        return colIndices;
    }

    private int getLastIdx() {
        Set<Integer> indices = Sets.newHashSet();
        for (Type type : footer.getTypesList()) {
            indices.addAll(type.getSubtypesList());
        }
        return Collections.max(indices);
    }

    @Override
    public Metadata getMetadata() throws IOException {
        return new Metadata(metadata);
    }

    List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics() {
        return metadata.getStripeStatsList();
    }

    public List<UserMetadataItem> getOrcProtoUserMetadata() {
        return footer.getMetadataList();
    }
}