com.blm.orc.OrcRawRecordMerger.java Source code

Introduction

Here is the source code for com.blm.orc.OrcRawRecordMerger.java
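
OrcRawRecordMerger merges a base file and a list of delta files into a single stream of ACID events, merge-sorted by originalTransaction, bucket, and rowId, with the newest transaction for each row first. The sketch below shows one way such a merger might be driven. It is illustrative only: the MergerExample class, the paths, and the bucket number are assumptions, the constructor is package-private so a real caller would sit in the com.blm.orc package, and the ValidTxnList is assumed to be supplied by the caller (for example, from the transaction manager).

package com.blm.orc;  // same package: the merger's constructor is package-private

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;

/** Hypothetical driver, not part of the original source. */
class MergerExample {
    static void dump(Configuration conf, Path baseFile, Path[] deltaDirs, int bucket,
            ValidTxnList validTxns) throws IOException {
        // open the ACID base file for this bucket
        Reader reader = OrcFile.createReader(baseFile, OrcFile.readerOptions(conf));
        // collapse events so only the latest version of each row is returned
        OrcRawRecordMerger merger = new OrcRawRecordMerger(conf, true, reader,
                false /* base file is already in ACID format */, bucket, validTxns,
                new Reader.Options() /* assumed to default to the whole file */, deltaDirs);
        RecordIdentifier key = merger.createKey();
        OrcStruct value = merger.createValue();
        while (merger.next(key, value)) {
            System.out.println(key + " delete=" + merger.isDelete(value));
        }
        merger.close();
    }
}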

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.blm.orc;

import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * Merges a base and a list of delta files together into a single stream of
 * events.
 */
public class OrcRawRecordMerger implements AcidInputFormat.RawReader<OrcStruct> {

    private static final Log LOG = LogFactory.getLog(OrcRawRecordMerger.class);

    private final Configuration conf;
    private final boolean collapse;
    private final RecordReader baseReader;
    private final long offset;
    private final long length;
    private final ValidTxnList validTxnList;
    private final int columns;
    private ReaderKey prevKey = new ReaderKey();
    // this is the key less than the lowest key we need to process
    private RecordIdentifier minKey;
    // this is the last key we need to process
    private RecordIdentifier maxKey;
    // an extra value so that we can return it while reading ahead
    private OrcStruct extraValue;

    /**
     * A RecordIdentifier extended with the current transaction id. This is the
     * key of our merge sort with the originalTransaction, bucket, and rowId
     * ascending and the currentTransaction descending. This means that if the
 * reader is collapsing events down to just the last update, only the first
 * instance of each record is required.
     */
    final static class ReaderKey extends RecordIdentifier {
        private long currentTransactionId;

        public ReaderKey() {
            this(-1, -1, -1, -1);
        }

        public ReaderKey(long originalTransaction, int bucket, long rowId, long currentTransactionId) {
            super(originalTransaction, bucket, rowId);
            this.currentTransactionId = currentTransactionId;
        }

        @Override
        public void set(RecordIdentifier other) {
            super.set(other);
            currentTransactionId = ((ReaderKey) other).currentTransactionId;
        }

        public void setValues(long originalTransactionId, int bucket, long rowId, long currentTransactionId) {
            setValues(originalTransactionId, bucket, rowId);
            this.currentTransactionId = currentTransactionId;
        }

        @Override
        public boolean equals(Object other) {
            return super.equals(other) && currentTransactionId == ((ReaderKey) other).currentTransactionId;
        }

        @Override
        public int compareTo(RecordIdentifier other) {
            int sup = compareToInternal(other);
            if (sup == 0) {
                if (other.getClass() == ReaderKey.class) {
                    ReaderKey oth = (ReaderKey) other;
                    if (currentTransactionId != oth.currentTransactionId) {
                        return currentTransactionId < oth.currentTransactionId ? +1 : -1;
                    }
                } else {
                    return -1;
                }
            }
            return sup;
        }

        public long getCurrentTransactionId() {
            return currentTransactionId;
        }

        /**
         * Compare rows without considering the currentTransactionId.
         * @param other the value to compare to
         * @return -1, 0, +1
         */
        public int compareRow(RecordIdentifier other) {
            return compareToInternal(other);
        }

        @Override
        public String toString() {
            return "{originalTxn: " + getTransactionId() + ", bucket: " + getBucketId() + ", row: " + getRowId()
                    + ", currentTxn: " + currentTransactionId + "}";
        }
    }
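
    // Illustrative ordering for ReaderKey.compareTo above (comment added for clarity,
    // not in the original source): given two events for the same row,
    // (originalTxn=5, bucket=0, rowId=10, currentTxn=8) and
    // (originalTxn=5, bucket=0, rowId=10, currentTxn=6), the key with the higher
    // currentTransactionId sorts first, so a collapsing reader returns the newest
    // version of the row and can skip the older versions that follow.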

    /**
     * A reader and the next record from that reader. The code reads ahead so that
     * we can return the lowest ReaderKey from each of the readers. Thus, the
     * next available row is nextRecord and only following records are still in
     * the reader.
     */
    static class ReaderPair {
        OrcStruct nextRecord;
        final Reader reader;
        final RecordReader recordReader;
        final ReaderKey key;
        final RecordIdentifier maxKey;
        final int bucket;

        /**
         * Create a reader that reads from the first key larger than minKey to any
         * keys equal to maxKey.
         * @param key the key to read into
         * @param reader the ORC file reader
         * @param bucket the bucket number for the file
         * @param minKey only return keys larger than minKey if it is non-null
         * @param maxKey only return keys less than or equal to maxKey if it is
         *               non-null
         * @param options options to provide to read the rows.
         * @throws IOException
         */
        ReaderPair(ReaderKey key, Reader reader, int bucket, RecordIdentifier minKey, RecordIdentifier maxKey,
                ReaderImpl.Options options) throws IOException {
            this.reader = reader;
            this.key = key;
            this.maxKey = maxKey;
            this.bucket = bucket;
            // TODO use stripe statistics to jump over stripes
            recordReader = reader.rowsOptions(options);
            // advance the reader until we reach the minimum key
            do {
                next(nextRecord);
            } while (nextRecord != null && (minKey != null && key.compareRow(minKey) <= 0));
        }

        void next(OrcStruct next) throws IOException {
            if (recordReader.hasNext()) {
                nextRecord = (OrcStruct) recordReader.next(next);
                // set the key
                key.setValues(OrcRecordUpdater.getOriginalTransaction(nextRecord),
                        OrcRecordUpdater.getBucket(nextRecord), OrcRecordUpdater.getRowId(nextRecord),
                        OrcRecordUpdater.getCurrentTransaction(nextRecord));

                // if this record is larger than maxKey, we need to stop
                if (maxKey != null && key.compareRow(maxKey) > 0) {
                    LOG.debug("key " + key + " > maxkey " + maxKey);
                    nextRecord = null;
                    recordReader.close();
                }
            } else {
                nextRecord = null;
                recordReader.close();
            }
        }

        int getColumns() {
            return reader.getTypes().get(OrcRecordUpdater.ROW + 1).getSubtypesCount();
        }
    }

    /**
     * A reader that pretends an original base file is a new version base file.
     * It wraps the underlying reader's row with an ACID event object and
     * makes the relevant translations.
     */
    static final class OriginalReaderPair extends ReaderPair {
        OriginalReaderPair(ReaderKey key, Reader reader, int bucket, RecordIdentifier minKey,
                RecordIdentifier maxKey, Reader.Options options) throws IOException {
            super(key, reader, bucket, minKey, maxKey, options);
        }

        @Override
        void next(OrcStruct next) throws IOException {
            if (recordReader.hasNext()) {
                long nextRowId = recordReader.getRowNumber();
                // have to do initialization here, because the super's constructor
                // calls next and thus we need to initialize before our constructor
                // runs
                if (next == null) {
                    nextRecord = new OrcStruct(OrcRecordUpdater.FIELDS);
                    IntWritable operation = new IntWritable(OrcRecordUpdater.INSERT_OPERATION);
                    nextRecord.setFieldValue(OrcRecordUpdater.OPERATION, operation);
                    nextRecord.setFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION, new LongWritable(0));
                    nextRecord.setFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION, new LongWritable(0));
                    nextRecord.setFieldValue(OrcRecordUpdater.BUCKET, new IntWritable(bucket));
                    nextRecord.setFieldValue(OrcRecordUpdater.ROW_ID, new LongWritable(nextRowId));
                    nextRecord.setFieldValue(OrcRecordUpdater.ROW, recordReader.next(null));
                } else {
                    nextRecord = next;
                    ((IntWritable) next.getFieldValue(OrcRecordUpdater.OPERATION))
                            .set(OrcRecordUpdater.INSERT_OPERATION);
                    ((LongWritable) next.getFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION)).set(0);
                    ((IntWritable) next.getFieldValue(OrcRecordUpdater.BUCKET)).set(bucket);
                    ((LongWritable) next.getFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION)).set(0);
                    ((LongWritable) next.getFieldValue(OrcRecordUpdater.ROW_ID)).set(0);
                    nextRecord.setFieldValue(OrcRecordUpdater.ROW,
                            recordReader.next(OrcRecordUpdater.getRow(next)));
                }
                key.setValues(0L, bucket, nextRowId, 0L);
                if (maxKey != null && key.compareRow(maxKey) > 0) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("key " + key + " > maxkey " + maxKey);
                    }
                    nextRecord = null;
                    recordReader.close();
                }
            } else {
                nextRecord = null;
                recordReader.close();
            }
        }

        @Override
        int getColumns() {
            return reader.getTypes().get(0).getSubtypesCount();
        }
    }

    private final TreeMap<ReaderKey, ReaderPair> readers = new TreeMap<ReaderKey, ReaderPair>();

    // The reader that currently has the lowest key.
    private ReaderPair primary;

    // The key of the next lowest reader.
    private ReaderKey secondaryKey = null;

    /**
     * Find the key range for original bucket files.
     * @param reader the reader
     * @param bucket the bucket number we are reading
     * @param options the options for reading with
     * @throws IOException
     */
    private void discoverOriginalKeyBounds(Reader reader, int bucket, Reader.Options options) throws IOException {
        long rowLength = 0;
        long rowOffset = 0;
        long offset = options.getOffset();
        long maxOffset = options.getMaxOffset();
        boolean isTail = true;
        for (StripeInformation stripe : reader.getStripes()) {
            if (offset > stripe.getOffset()) {
                rowOffset += stripe.getNumberOfRows();
            } else if (maxOffset > stripe.getOffset()) {
                rowLength += stripe.getNumberOfRows();
            } else {
                isTail = false;
                break;
            }
        }
        if (rowOffset > 0) {
            minKey = new RecordIdentifier(0, bucket, rowOffset - 1);
        }
        if (!isTail) {
            maxKey = new RecordIdentifier(0, bucket, rowOffset + rowLength - 1);
        }
    }
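
    // Worked example for discoverOriginalKeyBounds above (comment added for clarity,
    // not in the original source): for a split that covers only the second of three
    // 1000-row stripes, rowOffset ends up 1000 and rowLength 1000, so
    // minKey = (0, bucket, 999) and maxKey = (0, bucket, 1999); original files carry
    // no transaction ids, hence the constant 0 transaction in both keys.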

    /**
     * Find the key range for bucket files.
     * @param reader the reader
     * @param options the options for reading with
     * @throws IOException
     */
    private void discoverKeyBounds(Reader reader, Reader.Options options) throws IOException {
        RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader);
        long offset = options.getOffset();
        long maxOffset = options.getMaxOffset();
        int firstStripe = 0;
        int stripeCount = 0;
        boolean isTail = true;
        List<StripeInformation> stripes = reader.getStripes();
        for (StripeInformation stripe : stripes) {
            if (offset > stripe.getOffset()) {
                firstStripe += 1;
            } else if (maxOffset > stripe.getOffset()) {
                stripeCount += 1;
            } else {
                isTail = false;
                break;
            }
        }
        if (firstStripe != 0) {
            minKey = keyIndex[firstStripe - 1];
        }
        if (!isTail) {
            maxKey = keyIndex[firstStripe + stripeCount - 1];
        }
    }

    /**
     * Convert from the row include/sarg/columnNames to the event equivalent
     * for the underlying file.
     * @param options options for the row reader
     * @return a cloned options object that is modified for the event reader
     */
    static Reader.Options createEventOptions(Reader.Options options) {
        Reader.Options result = options.clone();
        result.range(options.getOffset(), Long.MAX_VALUE);
        // slide the columns down by 6 for the include array
        if (options.getInclude() != null) {
            boolean[] orig = options.getInclude();
            // we always need the base row
            orig[0] = true;
            boolean[] include = new boolean[orig.length + OrcRecordUpdater.FIELDS];
            Arrays.fill(include, 0, OrcRecordUpdater.FIELDS, true);
            for (int i = 0; i < orig.length; ++i) {
                include[i + OrcRecordUpdater.FIELDS] = orig[i];
            }
            result.include(include);
        }

        // slide the column names down by 6 for the name array
        if (options.getColumnNames() != null) {
            String[] orig = options.getColumnNames();
            String[] cols = new String[orig.length + OrcRecordUpdater.FIELDS];
            for (int i = 0; i < orig.length; ++i) {
                cols[i + OrcRecordUpdater.FIELDS] = orig[i];
            }
            result.searchArgument(options.getSearchArgument(), cols);
        }
        return result;
    }
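
    // Illustrative example for createEventOptions above (comment added for clarity,
    // not in the original source): with a user include of {true, false, true} (the
    // struct root plus two columns), the event include becomes
    // {true, true, true, true, true, true, true, false, true}: the six ACID wrapper
    // fields are always read and the user's flags shift right by
    // OrcRecordUpdater.FIELDS, with the root flag forced to true because the base
    // row struct itself is always needed.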

    /**
     * Create a reader that merge sorts the ACID events together.
     * @param conf the configuration
     * @param collapseEvents should the events on the same row be collapsed
     * @param isOriginal is the base file a pre-acid file
     * @param bucket the bucket we are reading
     * @param options the options to read with
     * @param deltaDirectory the list of delta directories to include
     * @throws IOException
     */
    OrcRawRecordMerger(Configuration conf, boolean collapseEvents, Reader reader, boolean isOriginal, int bucket,
            ValidTxnList validTxnList, Reader.Options options, Path[] deltaDirectory) throws IOException {
        this.conf = conf;
        this.collapse = collapseEvents;
        this.offset = options.getOffset();
        this.length = options.getLength();
        this.validTxnList = validTxnList;
        // modify the options to reflect the event instead of the base row
        Reader.Options eventOptions = createEventOptions(options);
        if (reader == null) {
            baseReader = null;
        } else {

            // find the min/max based on the offset and length
            if (isOriginal) {
                discoverOriginalKeyBounds(reader, bucket, options);
            } else {
                discoverKeyBounds(reader, options);
            }
            LOG.info("min key = " + minKey + ", max key = " + maxKey);
            // use the min/max instead of the byte range
            ReaderPair pair;
            ReaderKey key = new ReaderKey();
            if (isOriginal) {
                options = options.clone();
                options.range(options.getOffset(), Long.MAX_VALUE);
                pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey, options);
            } else {
                pair = new ReaderPair(key, reader, bucket, minKey, maxKey, eventOptions);
            }

            // if there is at least one record, put it in the map
            if (pair.nextRecord != null) {
                readers.put(key, pair);
            }
            baseReader = pair.recordReader;
        }

        // we always want to read all of the deltas
        eventOptions.range(0, Long.MAX_VALUE);
        // Turn off the sarg before pushing it to delta.  We never want to push a sarg to a delta as
        // it can produce wrong results (if the latest valid version of the record is filtered out by
        // the sarg) or ArrayIndexOutOfBounds errors (when the sarg is applied to a delete record)
        eventOptions.searchArgument(null, null);
        if (deltaDirectory != null) {
            for (Path delta : deltaDirectory) {
                ReaderKey key = new ReaderKey();
                Path deltaFile = AcidUtils.createBucketFile(delta, bucket);
                FileSystem fs = deltaFile.getFileSystem(conf);
                long length = getLastFlushLength(fs, deltaFile);
                if (fs.exists(deltaFile) && length != -1) {
                    Reader deltaReader = OrcFile.createReader(deltaFile,
                            OrcFile.readerOptions(conf).maxLength(length));
                    ReaderPair deltaPair = new ReaderPair(key, deltaReader, bucket, minKey, maxKey, eventOptions);
                    if (deltaPair.nextRecord != null) {
                        readers.put(key, deltaPair);
                    }
                }
            }
        }

        // get the first record
        Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
        if (entry == null) {
            columns = 0;
            primary = null;
        } else {
            primary = entry.getValue();
            if (readers.isEmpty()) {
                secondaryKey = null;
            } else {
                secondaryKey = readers.firstKey();
            }
            // get the number of columns in the user's rows
            columns = primary.getColumns();
        }
    }

    /**
     * Read the side file to get the last flush length.
     * @param fs the file system to use
     * @param deltaFile the path of the delta file
     * @return the maximum size of the file to use
     * @throws IOException
     */
    private static long getLastFlushLength(FileSystem fs, Path deltaFile) throws IOException {
        Path lengths = OrcRecordUpdater.getSideFile(deltaFile);
        long result = Long.MAX_VALUE;
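        // Assumed semantics (comment added for clarity): Long.MAX_VALUE means there is
        // no side file, so the whole delta file is readable; if the side file exists,
        // the last long it contains is taken as the usable length, and an empty side
        // file leaves result at -1, which the constructor treats as "nothing flushed
        // yet" and skips the delta.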
        try {
            FSDataInputStream stream = fs.open(lengths);
            result = -1;
            while (stream.available() > 0) {
                result = stream.readLong();
            }
            stream.close();
            return result;
        } catch (IOException ioe) {
            return result;
        }
    }

    @VisibleForTesting
    RecordIdentifier getMinKey() {
        return minKey;
    }

    @VisibleForTesting
    RecordIdentifier getMaxKey() {
        return maxKey;
    }

    @VisibleForTesting
    ReaderPair getCurrentReader() {
        return primary;
    }

    @VisibleForTesting
    Map<ReaderKey, ReaderPair> getOtherReaders() {
        return readers;
    }

    @Override
    public boolean next(RecordIdentifier recordIdentifier, OrcStruct prev) throws IOException {
        boolean keysSame = true;
        while (keysSame && primary != null) {

            // The primary's nextRecord is the next value to return
            OrcStruct current = primary.nextRecord;
            recordIdentifier.set(primary.key);

            // Advance the primary reader to the next record
            primary.next(extraValue);

            // Save the current record as the new extraValue for next time so that
            // we minimize allocations
            extraValue = current;

            // now that the primary reader has advanced, we need to see if we
            // continue to read it or move to the secondary.
            if (primary.nextRecord == null || primary.key.compareTo(secondaryKey) > 0) {

                // if the primary isn't done, push it back into the readers
                if (primary.nextRecord != null) {
                    readers.put(primary.key, primary);
                }

                // update primary and secondaryKey
                Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
                if (entry != null) {
                    primary = entry.getValue();
                    if (readers.isEmpty()) {
                        secondaryKey = null;
                    } else {
                        secondaryKey = readers.firstKey();
                    }
                } else {
                    primary = null;
                }
            }

            // if this transaction isn't ok, skip over it
            if (!validTxnList.isTxnCommitted(((ReaderKey) recordIdentifier).getCurrentTransactionId())) {
                continue;
            }

            // if we are collapsing, figure out if this is a new row
            if (collapse) {
                keysSame = prevKey.compareRow(recordIdentifier) == 0;
                if (!keysSame) {
                    prevKey.set(recordIdentifier);
                }
            } else {
                keysSame = false;
            }

            // set the output record by fiddling with the pointers so that we can
            // avoid a copy.
            prev.linkFields(current);
        }
        return !keysSame;
    }
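
    // Example of the collapse behaviour in next() above (comment added for clarity,
    // not in the original source): if the same row has an insert in the base and two
    // later updates in deltas, the three events sort newest-first, so the newest
    // committed event is the one returned to the caller and the two older events for
    // that row are skipped by the loop rather than being returned.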

    @Override
    public RecordIdentifier createKey() {
        return new ReaderKey();
    }

    @Override
    public OrcStruct createValue() {
        return new OrcStruct(OrcRecordUpdater.FIELDS);
    }

    @Override
    public long getPos() throws IOException {
        return offset + (long) (getProgress() * length);
    }

    @Override
    public void close() throws IOException {
        for (ReaderPair pair : readers.values()) {
            pair.recordReader.close();
        }
    }

    @Override
    public float getProgress() throws IOException {
        return baseReader == null ? 1 : baseReader.getProgress();
    }

    @Override
    public ObjectInspector getObjectInspector() {
        // Read the configuration parameters
        String columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS);
        // NOTE: if "columns.types" is missing, all columns will be of String type
        String columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES);

        // Parse the configuration parameters
        ArrayList<String> columnNames = new ArrayList<String>();
        Deque<Integer> virtualColumns = new ArrayDeque<Integer>();
        if (columnNameProperty != null && columnNameProperty.length() > 0) {
            String[] colNames = columnNameProperty.split(",");
            for (int i = 0; i < colNames.length; i++) {
                if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(colNames[i])) {
                    virtualColumns.addLast(i);
                } else {
                    columnNames.add(colNames[i]);
                }
            }
        }
        if (columnTypeProperty == null) {
            // Default type: all string
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < columnNames.size(); i++) {
                if (i > 0) {
                    sb.append(":");
                }
                sb.append("string");
            }
            columnTypeProperty = sb.toString();
        }

        ArrayList<TypeInfo> fieldTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
        while (virtualColumns.size() > 0) {
            fieldTypes.remove(virtualColumns.removeLast());
        }
        StructTypeInfo rowType = new StructTypeInfo();
        rowType.setAllStructFieldNames(columnNames);
        rowType.setAllStructFieldTypeInfos(fieldTypes);
        return OrcRecordUpdater.createEventSchema(OrcStruct.createObjectInspector(rowType));
    }

    @Override
    public boolean isDelete(OrcStruct value) {
        return OrcRecordUpdater.getOperation(value) == OrcRecordUpdater.DELETE_OPERATION;
    }

    /**
     * Get the number of columns in the underlying rows.
     * @return 0 if there are no base and no deltas.
     */
    public int getColumns() {
        return columns;
    }
}