Java tutorial: the ORC file writer (WriterImpl)
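Before walking through the implementation, here is a minimal usage sketch showing how a writer like this is normally obtained and driven. It assumes the standard Hive ORC factory API (OrcFile.createWriter, OrcFile.writerOptions, a reflection ObjectInspector) and that the example sits in the same package as Writer and OrcFile; the class in this file, com.blm.orc.WriterImpl, is a relocated copy of that writer, so treat the snippet as illustrative rather than as this project's exact API. The annotated source of WriterImpl follows.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;

// Hypothetical caller (assumed to live alongside com.blm.orc.OrcFile/Writer):
// obtain a Writer through the OrcFile factory rather than constructing
// WriterImpl directly.
public class OrcWriterExample {

  // Simple row type; an ObjectInspector is derived from it by reflection.
  static class MyRow {
    int id;
    String name;
    MyRow(int id, String name) { this.id = id; this.name = name; }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);

    // The factory takes stripe size, compression, buffer size, etc. from conf
    // defaults; every addRow() call is dispatched to the TreeWriters described
    // in the class below, and close() flushes the final stripe and footer.
    Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"),
        OrcFile.writerOptions(conf).inspector(inspector));
    writer.addRow(new MyRow(1, "hello"));
    writer.addRow(new MyRow(2, "world"));
    writer.close();
  }
}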
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.blm.orc;

import java.io.IOException;
import java.io.OutputStream;
import java.lang.management.ManagementFactory;
import java.nio.ByteBuffer;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.google.protobuf.ByteString;
import com.google.protobuf.CodedOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.orc.OrcProto;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.StripeStatistics;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.UserMetadataItem;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;

import com.blm.orc.CompressionCodec.Modifier;
import com.blm.orc.OrcFile.CompressionStrategy;
import com.blm.orc.OrcFile.EncodingStrategy;

import static com.google.common.base.Preconditions.checkArgument;

/**
 * An ORC file writer. The file is divided into stripes, which is the natural
 * unit of work when reading. Each stripe is buffered in memory until the
 * memory reaches the stripe size and then it is written out broken down by
 * columns. Each column is written by a TreeWriter that is specific to that
 * type of column. TreeWriters may have children TreeWriters that handle the
 * sub-types. Each of the TreeWriters writes the column's data as a set of
 * streams.
 *
 * This class is synchronized so that multi-threaded access is ok. In
 * particular, because the MemoryManager is shared between writers, this class
 * assumes that checkMemory may be called from a separate thread.
 */
class WriterImpl implements Writer, MemoryManager.Callback {

  private static final Log LOG = LogFactory.getLog(WriterImpl.class);

  private static final int HDFS_BUFFER_SIZE = 256 * 1024;
  private static final int MIN_ROW_INDEX_STRIDE = 1000;

  // threshold above which buffer size will be automatically resized
  private static final int COLUMN_COUNT_THRESHOLD = 1000;

  private final FileSystem fs;
  private final Path path;
  private final long defaultStripeSize;
  private long adjustedStripeSize;
  private final int rowIndexStride;
  private final CompressionKind compress;
  private final CompressionCodec codec;
  private final boolean addBlockPadding;
  private final int bufferSize;
  private final long blockSize;
  private final float paddingTolerance;
  // the streams that make up the current stripe
  private final Map<StreamName, BufferedStream> streams =
      new TreeMap<StreamName, BufferedStream>();
  private FSDataOutputStream rawWriter = null;
  // the compressed metadata information outStream
  private OutStream writer = null;
  // a protobuf outStream around streamFactory
  private CodedOutputStream protobufWriter = null;
  private long headerLength;
  private int columnCount;
  private long rowCount = 0;
  private long rowsInStripe = 0;
  private long rawDataSize = 0;
  private int rowsInIndex = 0;
  private int stripesAtLastFlush = -1;
  private final List<OrcProto.StripeInformation> stripes =
      new ArrayList<OrcProto.StripeInformation>();
  private final Map<String, ByteString> userMetadata =
      new TreeMap<String, ByteString>();
  private final StreamFactory streamFactory = new StreamFactory();
  private final TreeWriter treeWriter;
  private final boolean buildIndex;
  private final MemoryManager memoryManager;
  private final OrcFile.Version version;
  private final Configuration conf;
  private final OrcFile.WriterCallback callback;
  private final OrcFile.WriterContext callbackContext;
  private final OrcFile.EncodingStrategy
encodingStrategy; private final OrcFile.CompressionStrategy compressionStrategy; WriterImpl(FileSystem fs, Path path, Configuration conf, ObjectInspector inspector, long stripeSize, CompressionKind compress, int bufferSize, int rowIndexStride, MemoryManager memoryManager, boolean addBlockPadding, OrcFile.Version version, OrcFile.WriterCallback callback, OrcFile.EncodingStrategy encodingStrategy, CompressionStrategy compressionStrategy, float paddingTolerance, long blockSizeValue) throws IOException { this.fs = fs; this.path = path; this.conf = conf; this.callback = callback; if (callback != null) { callbackContext = new OrcFile.WriterContext() { @Override public Writer getWriter() { return WriterImpl.this; } }; } else { callbackContext = null; } this.adjustedStripeSize = stripeSize; this.defaultStripeSize = stripeSize; this.version = version; this.encodingStrategy = encodingStrategy; this.compressionStrategy = compressionStrategy; this.addBlockPadding = addBlockPadding; this.blockSize = blockSizeValue; this.paddingTolerance = paddingTolerance; this.compress = compress; this.rowIndexStride = rowIndexStride; this.memoryManager = memoryManager; buildIndex = rowIndexStride > 0; codec = createCodec(compress); this.bufferSize = getEstimatedBufferSize(bufferSize); treeWriter = createTreeWriter(inspector, streamFactory, false); if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) { throw new IllegalArgumentException("Row stride must be at least " + MIN_ROW_INDEX_STRIDE); } // ensure that we are able to handle callbacks before we register ourselves memoryManager.addWriter(path, stripeSize, this); } int getEstimatedBufferSize(int bs) { String colNames = conf.get(IOConstants.COLUMNS); long availableMem = getMemoryAvailableForORC(); if (colNames != null) { final int numCols = colNames.split(",").length; if (numCols > COLUMN_COUNT_THRESHOLD) { // In BufferedStream, there are 3 outstream buffers (compressed, // uncompressed and overflow) and list of previously compressed buffers. // Since overflow buffer is rarely used, lets consider only 2 allocation. // Also, initially, the list of compression buffers will be empty. final int outStreamBuffers = codec == null ? 1 : 2; // max possible streams per column is 5. For string columns, there is // ROW_INDEX, PRESENT, DATA, LENGTH, DICTIONARY_DATA streams. 
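// Worked example with assumed figures (not taken from this file): suppose
// getMemoryAvailableForORC() returns roughly 1 GB and the table has 2,000
// columns with compression enabled, so outStreamBuffers = 2. Then
//   remainingMem  ~= 0.9 GB after the 10% misc allocation below, and
//   estBufferSize ~= 0.9 GB / (5 streams * 2 buffers * 2000 cols) ~= 47 KB,
// which getClosestBufferSize() rounds up to the 64 KB bucket; the result is
// finally capped at the configured buffer size if that is smaller.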
final int maxStreams = 5; // Lets assume 10% memory for holding dictionary in memory and other // object allocations final long miscAllocation = (long) (0.1f * availableMem); // compute the available memory final long remainingMem = availableMem - miscAllocation; int estBufferSize = (int) (remainingMem / (maxStreams * outStreamBuffers * numCols)); estBufferSize = getClosestBufferSize(estBufferSize, bs); if (estBufferSize > bs) { estBufferSize = bs; } LOG.info("WIDE TABLE - Number of columns: " + numCols + " Chosen compression buffer size: " + estBufferSize); return estBufferSize; } } return bs; } private int getClosestBufferSize(int estBufferSize, int bs) { final int kb4 = 4 * 1024; final int kb8 = 8 * 1024; final int kb16 = 16 * 1024; final int kb32 = 32 * 1024; final int kb64 = 64 * 1024; final int kb128 = 128 * 1024; final int kb256 = 256 * 1024; if (estBufferSize <= kb4) { return kb4; } else if (estBufferSize > kb4 && estBufferSize <= kb8) { return kb8; } else if (estBufferSize > kb8 && estBufferSize <= kb16) { return kb16; } else if (estBufferSize > kb16 && estBufferSize <= kb32) { return kb32; } else if (estBufferSize > kb32 && estBufferSize <= kb64) { return kb64; } else if (estBufferSize > kb64 && estBufferSize <= kb128) { return kb128; } else { return kb256; } } // the assumption is only one ORC writer open at a time, which holds true for // most of the cases. HIVE-6455 forces single writer case. private long getMemoryAvailableForORC() { HiveConf.ConfVars poolVar = HiveConf.ConfVars.HIVE_ORC_FILE_MEMORY_POOL; double maxLoad = conf.getFloat(poolVar.varname, poolVar.defaultFloatVal); long totalMemoryPool = Math .round(ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getMax() * maxLoad); return totalMemoryPool; } static CompressionCodec createCodec(CompressionKind kind) { switch (kind) { case NONE: return null; case ZLIB: return new ZlibCodec(); case SNAPPY: return new SnappyCodec(); case LZO: try { Class<? extends CompressionCodec> lzo = (Class<? extends CompressionCodec>) Class .forName("org.apache.hadoop.hive.ql.io.orc.LzoCodec"); return lzo.newInstance(); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("LZO is not available.", e); } catch (InstantiationException e) { throw new IllegalArgumentException("Problem initializing LZO", e); } catch (IllegalAccessException e) { throw new IllegalArgumentException("Insufficient access to LZO", e); } default: throw new IllegalArgumentException("Unknown compression codec: " + kind); } } @Override public synchronized boolean checkMemory(double newScale) throws IOException { long limit = (long) Math.round(adjustedStripeSize * newScale); long size = estimateStripeSize(); if (LOG.isDebugEnabled()) { LOG.debug("ORC writer " + path + " size = " + size + " limit = " + limit); } if (size > limit) { flushStripe(); return true; } return false; } /** * This class is used to hold the contents of streams as they are buffered. * The TreeWriters write to the outStream and the codec compresses the * data as buffers fill up and stores them in the output list. When the * stripe is being written, the whole stream is written to the file. */ private class BufferedStream implements OutStream.OutputReceiver { private final OutStream outStream; private final List<ByteBuffer> output = new ArrayList<ByteBuffer>(); BufferedStream(String name, int bufferSize, CompressionCodec codec) throws IOException { outStream = new OutStream(name, bufferSize, codec, this); } /** * Receive a buffer from the compression codec. 
* @param buffer the buffer to save * @throws IOException */ @Override public void output(ByteBuffer buffer) { output.add(buffer); } /** * Get the number of bytes in buffers that are allocated to this stream. * @return number of bytes in buffers */ public long getBufferSize() { long result = 0; for (ByteBuffer buf : output) { result += buf.capacity(); } return outStream.getBufferSize() + result; } /** * Flush the stream to the codec. * @throws IOException */ public void flush() throws IOException { outStream.flush(); } /** * Clear all of the buffers. * @throws IOException */ public void clear() throws IOException { outStream.clear(); output.clear(); } /** * Check the state of suppress flag in output stream * @return value of suppress flag */ public boolean isSuppressed() { return outStream.isSuppressed(); } /** * Get the number of bytes that will be written to the output. Assumes * the stream has already been flushed. * @return the number of bytes */ public long getOutputSize() { long result = 0; for (ByteBuffer buffer : output) { result += buffer.remaining(); } return result; } /** * Write the saved compressed buffers to the OutputStream. * @param out the stream to write to * @throws IOException */ void spillTo(OutputStream out) throws IOException { for (ByteBuffer buffer : output) { out.write(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); } } @Override public String toString() { return outStream.toString(); } } /** * An output receiver that writes the ByteBuffers to the output stream * as they are received. */ private class DirectStream implements OutStream.OutputReceiver { private final FSDataOutputStream output; DirectStream(FSDataOutputStream output) { this.output = output; } @Override public void output(ByteBuffer buffer) throws IOException { output.write(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); } } private static class RowIndexPositionRecorder implements PositionRecorder { private final OrcProto.RowIndexEntry.Builder builder; RowIndexPositionRecorder(OrcProto.RowIndexEntry.Builder builder) { this.builder = builder; } @Override public void addPosition(long position) { builder.addPositions(position); } } /** * Interface from the Writer to the TreeWriters. This limits the visibility * that the TreeWriters have into the Writer. */ private class StreamFactory { /** * Create a stream to store part of a column. * @param column the column id for the stream * @param kind the kind of stream * @return The output outStream that the section needs to be written to. * @throws IOException */ public OutStream createStream(int column, OrcProto.Stream.Kind kind) throws IOException { final StreamName name = new StreamName(column, kind); final EnumSet<CompressionCodec.Modifier> modifiers; switch (kind) { case DATA: case DICTIONARY_DATA: if (getCompressionStrategy() == CompressionStrategy.SPEED) { modifiers = EnumSet.of(Modifier.FAST, Modifier.TEXT); } else { modifiers = EnumSet.of(Modifier.DEFAULT, Modifier.TEXT); } break; case LENGTH: case DICTIONARY_COUNT: case PRESENT: case ROW_INDEX: case SECONDARY: // easily compressed using the fastest modes modifiers = EnumSet.of(Modifier.FASTEST, Modifier.BINARY); break; default: LOG.warn("Missing ORC compression modifiers for " + kind); modifiers = null; break; } BufferedStream result = streams.get(name); if (result == null) { result = new BufferedStream(name.toString(), bufferSize, codec == null ? 
codec : codec.modify(modifiers)); streams.put(name, result); } return result.outStream; } /** * Get the next column id. * @return a number from 0 to the number of columns - 1 */ public int getNextColumnId() { return columnCount++; } /** * Get the stride rate of the row index. */ public int getRowIndexStride() { return rowIndexStride; } /** * Should be building the row index. * @return true if we are building the index */ public boolean buildIndex() { return buildIndex; } /** * Is the ORC file compressed? * @return are the streams compressed */ public boolean isCompressed() { return codec != null; } /** * Get the encoding strategy to use. * @return encoding strategy */ public EncodingStrategy getEncodingStrategy() { return encodingStrategy; } /** * Get the compression strategy to use. * @return compression strategy */ public CompressionStrategy getCompressionStrategy() { return compressionStrategy; } /** * Get the writer's configuration. * @return configuration */ public Configuration getConfiguration() { return conf; } /** * Get the version of the file to write. */ public OrcFile.Version getVersion() { return version; } } /** * The parent class of all of the writers for each column. Each column * is written by an instance of this class. The compound types (struct, * list, map, and union) have children tree writers that write the children * types. */ private abstract static class TreeWriter { protected final int id; protected final ObjectInspector inspector; private final BitFieldWriter isPresent; private final boolean isCompressed; protected final ColumnStatisticsImpl indexStatistics; protected final ColumnStatisticsImpl stripeColStatistics; private final ColumnStatisticsImpl fileStatistics; protected TreeWriter[] childrenWriters; protected final RowIndexPositionRecorder rowIndexPosition; private final OrcProto.RowIndex.Builder rowIndex; private final OrcProto.RowIndexEntry.Builder rowIndexEntry; private final PositionedOutputStream rowIndexStream; private boolean foundNulls; private OutStream isPresentOutStream; private final List<StripeStatistics.Builder> stripeStatsBuilders; /** * Create a tree writer. * @param columnId the column id of the column to write * @param inspector the object inspector to use * @param streamFactory limited access to the Writer's data. * @param nullable can the value be null? 
* @throws IOException */ TreeWriter(int columnId, ObjectInspector inspector, StreamFactory streamFactory, boolean nullable) throws IOException { this.isCompressed = streamFactory.isCompressed(); this.id = columnId; this.inspector = inspector; if (nullable) { isPresentOutStream = streamFactory.createStream(id, OrcProto.Stream.Kind.PRESENT); isPresent = new BitFieldWriter(isPresentOutStream, 1); } else { isPresent = null; } this.foundNulls = false; indexStatistics = ColumnStatisticsImpl.create(inspector); stripeColStatistics = ColumnStatisticsImpl.create(inspector); fileStatistics = ColumnStatisticsImpl.create(inspector); childrenWriters = new TreeWriter[0]; rowIndex = OrcProto.RowIndex.newBuilder(); rowIndexEntry = OrcProto.RowIndexEntry.newBuilder(); rowIndexPosition = new RowIndexPositionRecorder(rowIndexEntry); stripeStatsBuilders = Lists.newArrayList(); if (streamFactory.buildIndex()) { rowIndexStream = streamFactory.createStream(id, OrcProto.Stream.Kind.ROW_INDEX); } else { rowIndexStream = null; } } protected OrcProto.RowIndex.Builder getRowIndex() { return rowIndex; } protected ColumnStatisticsImpl getStripeStatistics() { return stripeColStatistics; } protected ColumnStatisticsImpl getFileStatistics() { return fileStatistics; } protected OrcProto.RowIndexEntry.Builder getRowIndexEntry() { return rowIndexEntry; } IntegerWriter createIntegerWriter(PositionedOutputStream output, boolean signed, boolean isDirectV2, StreamFactory writer) { if (isDirectV2) { boolean alignedBitpacking = false; if (writer.getEncodingStrategy().equals(EncodingStrategy.SPEED)) { alignedBitpacking = true; } return new RunLengthIntegerWriterV2(output, signed, alignedBitpacking); } else { return new RunLengthIntegerWriter(output, signed); } } boolean isNewWriteFormat(StreamFactory writer) { return writer.getVersion() != OrcFile.Version.V_0_11; } /** * Add a new value to the column. * @param obj * @throws IOException */ void write(Object obj) throws IOException { if (obj != null) { indexStatistics.increment(); } if (isPresent != null) { isPresent.write(obj == null ? 0 : 1); if (obj == null) { foundNulls = true; } } } private void removeIsPresentPositions() { for (int i = 0; i < rowIndex.getEntryCount(); ++i) { RowIndexEntry.Builder entry = rowIndex.getEntryBuilder(i); List<Long> positions = entry.getPositionsList(); // bit streams use 3 positions if uncompressed, 4 if compressed positions = positions.subList(isCompressed ? 4 : 3, positions.size()); entry.clearPositions(); entry.addAllPositions(positions); } } /** * Write the stripe out to the file. * @param builder the stripe footer that contains the information about the * layout of the stripe. The TreeWriter is required to update * the footer with its information. * @param requiredIndexEntries the number of index entries that are * required. this is to check to make sure the * row index is well formed. 
* @throws IOException */ void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { if (isPresent != null) { isPresent.flush(); // if no nulls are found in a stream, then suppress the stream if (!foundNulls) { isPresentOutStream.suppress(); // since isPresent bitstream is suppressed, update the index to // remove the positions of the isPresent stream if (rowIndexStream != null) { removeIsPresentPositions(); } } } // merge stripe-level column statistics to file statistics and write it to // stripe statistics OrcProto.StripeStatistics.Builder stripeStatsBuilder = OrcProto.StripeStatistics.newBuilder(); writeStripeStatistics(stripeStatsBuilder, this); stripeStatsBuilders.add(stripeStatsBuilder); // reset the flag for next stripe foundNulls = false; builder.addColumns(getEncoding()); if (rowIndexStream != null) { if (rowIndex.getEntryCount() != requiredIndexEntries) { throw new IllegalArgumentException("Column has wrong number of " + "index entries found: " + rowIndex.getEntryCount() + " expected: " + requiredIndexEntries); } rowIndex.build().writeTo(rowIndexStream); rowIndexStream.flush(); } rowIndex.clear(); rowIndexEntry.clear(); } private void writeStripeStatistics(OrcProto.StripeStatistics.Builder builder, TreeWriter treeWriter) { treeWriter.fileStatistics.merge(treeWriter.stripeColStatistics); builder.addColStats(treeWriter.stripeColStatistics.serialize().build()); treeWriter.stripeColStatistics.reset(); for (TreeWriter child : treeWriter.getChildrenWriters()) { writeStripeStatistics(builder, child); } } TreeWriter[] getChildrenWriters() { return childrenWriters; } /** * Get the encoding for this column. * @return the information about the encoding of this column */ OrcProto.ColumnEncoding getEncoding() { return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); } /** * Create a row index entry with the previous location and the current * index statistics. Also merges the index statistics into the file * statistics before they are cleared. Finally, it records the start of the * next index and ensures all of the children columns also create an entry. * @throws IOException */ void createRowIndexEntry() throws IOException { stripeColStatistics.merge(indexStatistics); rowIndexEntry.setStatistics(indexStatistics.serialize()); indexStatistics.reset(); rowIndex.addEntry(rowIndexEntry); rowIndexEntry.clear(); recordPosition(rowIndexPosition); for (TreeWriter child : childrenWriters) { child.createRowIndexEntry(); } } /** * Record the current position in each of this column's streams. * @param recorder where should the locations be recorded * @throws IOException */ void recordPosition(PositionRecorder recorder) throws IOException { if (isPresent != null) { isPresent.getPosition(recorder); } } /** * Estimate how much memory the writer is consuming excluding the streams. * @return the number of bytes. 
*/ long estimateMemory() { long result = 0; for (TreeWriter child : childrenWriters) { result += child.estimateMemory(); } return result; } } private static class BooleanTreeWriter extends TreeWriter { private final BitFieldWriter writer; BooleanTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); PositionedOutputStream out = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.writer = new BitFieldWriter(out, 1); recordPosition(rowIndexPosition); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { boolean val = ((BooleanObjectInspector) inspector).get(obj); indexStatistics.updateBoolean(val); writer.write(val ? 1 : 0); } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); writer.flush(); recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); writer.getPosition(recorder); } } private static class ByteTreeWriter extends TreeWriter { private final RunLengthByteWriter writer; ByteTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); this.writer = new RunLengthByteWriter(writer.createStream(id, OrcProto.Stream.Kind.DATA)); recordPosition(rowIndexPosition); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { byte val = ((ByteObjectInspector) inspector).get(obj); indexStatistics.updateInteger(val); writer.write(val); } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); writer.flush(); recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); writer.getPosition(recorder); } } private static class IntegerTreeWriter extends TreeWriter { private final IntegerWriter writer; private final ShortObjectInspector shortInspector; private final IntObjectInspector intInspector; private final LongObjectInspector longInspector; private boolean isDirectV2 = true; IntegerTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); PositionedOutputStream out = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.isDirectV2 = isNewWriteFormat(writer); this.writer = createIntegerWriter(out, true, isDirectV2, writer); if (inspector instanceof IntObjectInspector) { intInspector = (IntObjectInspector) inspector; shortInspector = null; longInspector = null; } else { intInspector = null; if (inspector instanceof LongObjectInspector) { longInspector = (LongObjectInspector) inspector; shortInspector = null; } else { shortInspector = (ShortObjectInspector) inspector; longInspector = null; } } recordPosition(rowIndexPosition); } @Override OrcProto.ColumnEncoding getEncoding() { if (isDirectV2) { return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); } return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { long val; if (intInspector != null) { val = 
intInspector.get(obj); } else if (longInspector != null) { val = longInspector.get(obj); } else { val = shortInspector.get(obj); } indexStatistics.updateInteger(val); writer.write(val); } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); writer.flush(); recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); writer.getPosition(recorder); } } private static class FloatTreeWriter extends TreeWriter { private final PositionedOutputStream stream; private final SerializationUtils utils; FloatTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); this.stream = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.utils = new SerializationUtils(); recordPosition(rowIndexPosition); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { float val = ((FloatObjectInspector) inspector).get(obj); indexStatistics.updateDouble(val); utils.writeFloat(stream, val); } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); stream.flush(); recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); stream.getPosition(recorder); } } private static class DoubleTreeWriter extends TreeWriter { private final PositionedOutputStream stream; private final SerializationUtils utils; DoubleTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); this.stream = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.utils = new SerializationUtils(); recordPosition(rowIndexPosition); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { double val = ((DoubleObjectInspector) inspector).get(obj); indexStatistics.updateDouble(val); utils.writeDouble(stream, val); } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); stream.flush(); recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); stream.getPosition(recorder); } } private static class StringTreeWriter extends TreeWriter { private static final int INITIAL_DICTIONARY_SIZE = 4096; private final OutStream stringOutput; private final IntegerWriter lengthOutput; private final IntegerWriter rowOutput; private final StringRedBlackTree dictionary = new StringRedBlackTree(INITIAL_DICTIONARY_SIZE); private final DynamicIntArray rows = new DynamicIntArray(); private final PositionedOutputStream directStreamOutput; private final IntegerWriter directLengthOutput; private final List<OrcProto.RowIndexEntry> savedRowIndex = new ArrayList<OrcProto.RowIndexEntry>(); private final boolean buildIndex; private final List<Long> rowIndexValueCount = new ArrayList<Long>(); // If the number of keys in a dictionary is greater than this fraction of //the total number of non-null rows, turn off dictionary encoding private final float dictionaryKeySizeThreshold; private boolean useDictionaryEncoding = true; private boolean 
isDirectV2 = true; private boolean doneDictionaryCheck; private final boolean strideDictionaryCheck; StringTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); this.isDirectV2 = isNewWriteFormat(writer); stringOutput = writer.createStream(id, OrcProto.Stream.Kind.DICTIONARY_DATA); lengthOutput = createIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer); rowOutput = createIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.DATA), false, isDirectV2, writer); recordPosition(rowIndexPosition); rowIndexValueCount.add(0L); buildIndex = writer.buildIndex(); directStreamOutput = writer.createStream(id, OrcProto.Stream.Kind.DATA); directLengthOutput = createIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer); dictionaryKeySizeThreshold = writer.getConfiguration().getFloat( HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname, HiveConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.defaultFloatVal); strideDictionaryCheck = writer.getConfiguration().getBoolean( HiveConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname, HiveConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.defaultBoolVal); doneDictionaryCheck = false; } /** * Method to retrieve text values from the value object, which can be overridden * by subclasses. * @param obj value * @return Text text value from obj */ Text getTextValue(Object obj) { return ((StringObjectInspector) inspector).getPrimitiveWritableObject(obj); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { Text val = getTextValue(obj); if (useDictionaryEncoding || !strideDictionaryCheck) { rows.add(dictionary.add(val)); } else { // write data and length directStreamOutput.write(val.getBytes(), 0, val.getLength()); directLengthOutput.write(val.getLength()); } indexStatistics.updateString(val); } } private boolean checkDictionaryEncoding() { if (!doneDictionaryCheck) { // Set the flag indicating whether or not to use dictionary encoding // based on whether or not the fraction of distinct keys over number of // non-null rows is less than the configured threshold float ratio = rows.size() > 0 ? (float) (dictionary.size()) / rows.size() : 0.0f; useDictionaryEncoding = !isDirectV2 || ratio <= dictionaryKeySizeThreshold; doneDictionaryCheck = true; } return useDictionaryEncoding; } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { // if rows in stripe is less than dictionaryCheckAfterRows, dictionary // checking would not have happened. So do it again here. checkDictionaryEncoding(); if (useDictionaryEncoding) { flushDictionary(); } else { // flushout any left over entries from dictionary if (rows.size() > 0) { flushDictionary(); } // suppress the stream for every stripe if dictionary is disabled stringOutput.suppress(); } // we need to build the rowindex before calling super, since it // writes it out. super.writeStripe(builder, requiredIndexEntries); stringOutput.flush(); lengthOutput.flush(); rowOutput.flush(); directStreamOutput.flush(); directLengthOutput.flush(); // reset all of the fields to be ready for the next stripe. 
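// Illustrative numbers for the dictionary decision above (assumed, not from
// this file): with the default hive.exec.orc.dictionary.key.size.threshold of
// 0.8, a stripe holding 10,000 non-null strings of which 9,500 are distinct
// gives a ratio of 0.95, so checkDictionaryEncoding() turns dictionary
// encoding off and the values are flushed to the direct DATA/LENGTH streams.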
dictionary.clear(); savedRowIndex.clear(); rowIndexValueCount.clear(); recordPosition(rowIndexPosition); rowIndexValueCount.add(0L); if (!useDictionaryEncoding) { // record the start positions of first index stride of next stripe i.e // beginning of the direct streams when dictionary is disabled recordDirectStreamPosition(); } } private void flushDictionary() throws IOException { final int[] dumpOrder = new int[dictionary.size()]; if (useDictionaryEncoding) { // Write the dictionary by traversing the red-black tree writing out // the bytes and lengths; and creating the map from the original order // to the final sorted order. dictionary.visit(new StringRedBlackTree.Visitor() { private int currentId = 0; @Override public void visit(StringRedBlackTree.VisitorContext context) throws IOException { context.writeBytes(stringOutput); lengthOutput.write(context.getLength()); dumpOrder[context.getOriginalPosition()] = currentId++; } }); } else { // for direct encoding, we don't want the dictionary data stream stringOutput.suppress(); } int length = rows.size(); int rowIndexEntry = 0; OrcProto.RowIndex.Builder rowIndex = getRowIndex(); Text text = new Text(); // write the values translated into the dump order. for (int i = 0; i <= length; ++i) { // now that we are writing out the row values, we can finalize the // row index if (buildIndex) { while (i == rowIndexValueCount.get(rowIndexEntry) && rowIndexEntry < savedRowIndex.size()) { OrcProto.RowIndexEntry.Builder base = savedRowIndex.get(rowIndexEntry++).toBuilder(); if (useDictionaryEncoding) { rowOutput.getPosition(new RowIndexPositionRecorder(base)); } else { PositionRecorder posn = new RowIndexPositionRecorder(base); directStreamOutput.getPosition(posn); directLengthOutput.getPosition(posn); } rowIndex.addEntry(base.build()); } } if (i != length) { if (useDictionaryEncoding) { rowOutput.write(dumpOrder[rows.get(i)]); } else { dictionary.getText(text, rows.get(i)); directStreamOutput.write(text.getBytes(), 0, text.getLength()); directLengthOutput.write(text.getLength()); } } } rows.clear(); } @Override OrcProto.ColumnEncoding getEncoding() { // Returns the encoding used for the last call to writeStripe if (useDictionaryEncoding) { if (isDirectV2) { return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) .setDictionarySize(dictionary.size()).build(); } return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DICTIONARY) .setDictionarySize(dictionary.size()).build(); } else { if (isDirectV2) { return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2) .build(); } return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); } } /** * This method doesn't call the super method, because unlike most of the * other TreeWriters, this one can't record the position in the streams * until the stripe is being flushed. Therefore it saves all of the entries * and augments them with the final information as the stripe is written. 
* @throws IOException */ @Override void createRowIndexEntry() throws IOException { getStripeStatistics().merge(indexStatistics); OrcProto.RowIndexEntry.Builder rowIndexEntry = getRowIndexEntry(); rowIndexEntry.setStatistics(indexStatistics.serialize()); indexStatistics.reset(); OrcProto.RowIndexEntry base = rowIndexEntry.build(); savedRowIndex.add(base); rowIndexEntry.clear(); recordPosition(rowIndexPosition); rowIndexValueCount.add(Long.valueOf(rows.size())); if (strideDictionaryCheck) { checkDictionaryEncoding(); } if (!useDictionaryEncoding) { if (rows.size() > 0) { flushDictionary(); // just record the start positions of next index stride recordDirectStreamPosition(); } else { // record the start positions of next index stride recordDirectStreamPosition(); getRowIndex().addEntry(base); } } } private void recordDirectStreamPosition() throws IOException { directStreamOutput.getPosition(rowIndexPosition); directLengthOutput.getPosition(rowIndexPosition); } @Override long estimateMemory() { return rows.getSizeInBytes() + dictionary.getSizeInBytes(); } } /** * Under the covers, char is written to ORC the same way as string. */ private static class CharTreeWriter extends StringTreeWriter { CharTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); } /** * Override base class implementation to support char values. */ @Override Text getTextValue(Object obj) { return (((HiveCharObjectInspector) inspector).getPrimitiveWritableObject(obj)).getTextValue(); } } /** * Under the covers, varchar is written to ORC the same way as string. */ private static class VarcharTreeWriter extends StringTreeWriter { VarcharTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); } /** * Override base class implementation to support varchar values. 
*/ @Override Text getTextValue(Object obj) { return (((HiveVarcharObjectInspector) inspector).getPrimitiveWritableObject(obj)).getTextValue(); } } private static class BinaryTreeWriter extends TreeWriter { private final PositionedOutputStream stream; private final IntegerWriter length; private boolean isDirectV2 = true; BinaryTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); this.stream = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.isDirectV2 = isNewWriteFormat(writer); this.length = createIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer); recordPosition(rowIndexPosition); } @Override OrcProto.ColumnEncoding getEncoding() { if (isDirectV2) { return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); } return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { BytesWritable val = ((BinaryObjectInspector) inspector).getPrimitiveWritableObject(obj); stream.write(val.getBytes(), 0, val.getLength()); length.write(val.getLength()); indexStatistics.updateBinary(val); } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); stream.flush(); length.flush(); recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); stream.getPosition(recorder); length.getPosition(recorder); } } static final int MILLIS_PER_SECOND = 1000; static final long BASE_TIMESTAMP = Timestamp.valueOf("2015-01-01 00:00:00").getTime() / MILLIS_PER_SECOND; private static class TimestampTreeWriter extends TreeWriter { private final IntegerWriter seconds; private final IntegerWriter nanos; private final boolean isDirectV2; TimestampTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); this.isDirectV2 = isNewWriteFormat(writer); this.seconds = createIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.DATA), true, isDirectV2, writer); this.nanos = createIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.SECONDARY), false, isDirectV2, writer); recordPosition(rowIndexPosition); } @Override OrcProto.ColumnEncoding getEncoding() { if (isDirectV2) { return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); } return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { Timestamp val = ((TimestampObjectInspector) inspector).getPrimitiveJavaObject(obj); indexStatistics.updateTimestamp(val); seconds.write((val.getTime() / MILLIS_PER_SECOND) - BASE_TIMESTAMP); nanos.write(formatNanos(val.getNanos())); } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); seconds.flush(); nanos.flush(); recordPosition(rowIndexPosition); } private static long formatNanos(int nanos) { if (nanos == 0) { return 0; } else if (nanos % 100 != 0) { return ((long) nanos) << 3; } else { nanos /= 100; int trailingZeros = 1; while (nanos % 10 == 0 
&& trailingZeros < 7) { nanos /= 10; trailingZeros += 1; } return ((long) nanos) << 3 | trailingZeros; } } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); seconds.getPosition(recorder); nanos.getPosition(recorder); } } private static class DateTreeWriter extends TreeWriter { private final IntegerWriter writer; private final boolean isDirectV2; DateTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); PositionedOutputStream out = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.isDirectV2 = isNewWriteFormat(writer); this.writer = createIntegerWriter(out, true, isDirectV2, writer); recordPosition(rowIndexPosition); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { // Using the Writable here as it's used directly for writing as well as for stats. DateWritable val = ((DateObjectInspector) inspector).getPrimitiveWritableObject(obj); indexStatistics.updateDate(val); writer.write(val.getDays()); } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); writer.flush(); recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); writer.getPosition(recorder); } @Override OrcProto.ColumnEncoding getEncoding() { if (isDirectV2) { return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); } return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); } } private static class DecimalTreeWriter extends TreeWriter { private final PositionedOutputStream valueStream; private final IntegerWriter scaleStream; private final boolean isDirectV2; DecimalTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); this.isDirectV2 = isNewWriteFormat(writer); valueStream = writer.createStream(id, OrcProto.Stream.Kind.DATA); this.scaleStream = createIntegerWriter(writer.createStream(id, OrcProto.Stream.Kind.SECONDARY), true, isDirectV2, writer); recordPosition(rowIndexPosition); } @Override OrcProto.ColumnEncoding getEncoding() { if (isDirectV2) { return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); } return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { HiveDecimal decimal = ((HiveDecimalObjectInspector) inspector).getPrimitiveJavaObject(obj); if (decimal == null) { return; } SerializationUtils.writeBigInteger(valueStream, decimal.unscaledValue()); scaleStream.write(decimal.scale()); indexStatistics.updateDecimal(decimal); } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); valueStream.flush(); scaleStream.flush(); recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); valueStream.getPosition(recorder); scaleStream.getPosition(recorder); } } private static class StructTreeWriter extends TreeWriter { private final List<? 
extends StructField> fields; StructTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); StructObjectInspector structObjectInspector = (StructObjectInspector) inspector; fields = structObjectInspector.getAllStructFieldRefs(); childrenWriters = new TreeWriter[fields.size()]; for (int i = 0; i < childrenWriters.length; ++i) { childrenWriters[i] = createTreeWriter(fields.get(i).getFieldObjectInspector(), writer, true); } recordPosition(rowIndexPosition); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { StructObjectInspector insp = (StructObjectInspector) inspector; for (int i = 0; i < fields.size(); ++i) { StructField field = fields.get(i); TreeWriter writer = childrenWriters[i]; writer.write(insp.getStructFieldData(obj, field)); } } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); for (TreeWriter child : childrenWriters) { child.writeStripe(builder, requiredIndexEntries); } recordPosition(rowIndexPosition); } } private static class ListTreeWriter extends TreeWriter { private final IntegerWriter lengths; private final boolean isDirectV2; ListTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); this.isDirectV2 = isNewWriteFormat(writer); ListObjectInspector listObjectInspector = (ListObjectInspector) inspector; childrenWriters = new TreeWriter[1]; childrenWriters[0] = createTreeWriter(listObjectInspector.getListElementObjectInspector(), writer, true); lengths = createIntegerWriter(writer.createStream(columnId, OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer); recordPosition(rowIndexPosition); } @Override OrcProto.ColumnEncoding getEncoding() { if (isDirectV2) { return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); } return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { ListObjectInspector insp = (ListObjectInspector) inspector; int len = insp.getListLength(obj); lengths.write(len); for (int i = 0; i < len; ++i) { childrenWriters[0].write(insp.getListElement(obj, i)); } } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); lengths.flush(); for (TreeWriter child : childrenWriters) { child.writeStripe(builder, requiredIndexEntries); } recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); lengths.getPosition(recorder); } } private static class MapTreeWriter extends TreeWriter { private final IntegerWriter lengths; private final boolean isDirectV2; MapTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); this.isDirectV2 = isNewWriteFormat(writer); MapObjectInspector insp = (MapObjectInspector) inspector; childrenWriters = new TreeWriter[2]; childrenWriters[0] = createTreeWriter(insp.getMapKeyObjectInspector(), writer, true); childrenWriters[1] = createTreeWriter(insp.getMapValueObjectInspector(), writer, true); lengths = 
createIntegerWriter(writer.createStream(columnId, OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer); recordPosition(rowIndexPosition); } @Override OrcProto.ColumnEncoding getEncoding() { if (isDirectV2) { return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); } return OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { MapObjectInspector insp = (MapObjectInspector) inspector; // this sucks, but it will have to do until we can get a better // accessor in the MapObjectInspector. Map<?, ?> valueMap = insp.getMap(obj); lengths.write(valueMap.size()); for (Map.Entry<?, ?> entry : valueMap.entrySet()) { childrenWriters[0].write(entry.getKey()); childrenWriters[1].write(entry.getValue()); } } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); lengths.flush(); for (TreeWriter child : childrenWriters) { child.writeStripe(builder, requiredIndexEntries); } recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); lengths.getPosition(recorder); } } private static class UnionTreeWriter extends TreeWriter { private final RunLengthByteWriter tags; UnionTreeWriter(int columnId, ObjectInspector inspector, StreamFactory writer, boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); UnionObjectInspector insp = (UnionObjectInspector) inspector; List<ObjectInspector> choices = insp.getObjectInspectors(); childrenWriters = new TreeWriter[choices.size()]; for (int i = 0; i < childrenWriters.length; ++i) { childrenWriters[i] = createTreeWriter(choices.get(i), writer, true); } tags = new RunLengthByteWriter(writer.createStream(columnId, OrcProto.Stream.Kind.DATA)); recordPosition(rowIndexPosition); } @Override void write(Object obj) throws IOException { super.write(obj); if (obj != null) { UnionObjectInspector insp = (UnionObjectInspector) inspector; byte tag = insp.getTag(obj); tags.write(tag); childrenWriters[tag].write(insp.getField(obj)); } } @Override void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); tags.flush(); for (TreeWriter child : childrenWriters) { child.writeStripe(builder, requiredIndexEntries); } recordPosition(rowIndexPosition); } @Override void recordPosition(PositionRecorder recorder) throws IOException { super.recordPosition(recorder); tags.getPosition(recorder); } } private static TreeWriter createTreeWriter(ObjectInspector inspector, StreamFactory streamFactory, boolean nullable) throws IOException { switch (inspector.getCategory()) { case PRIMITIVE: switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) { case BOOLEAN: return new BooleanTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case BYTE: return new ByteTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case SHORT: case INT: case LONG: return new IntegerTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case FLOAT: return new FloatTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case DOUBLE: return new DoubleTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); 
case STRING: return new StringTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case CHAR: return new CharTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case VARCHAR: return new VarcharTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case BINARY: return new BinaryTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case TIMESTAMP: return new TimestampTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case DATE: return new DateTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case DECIMAL: return new DecimalTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); default: throw new IllegalArgumentException( "Bad primitive category " + ((PrimitiveObjectInspector) inspector).getPrimitiveCategory()); } case STRUCT: return new StructTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case MAP: return new MapTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case LIST: return new ListTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case UNION: return new UnionTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); default: throw new IllegalArgumentException("Bad category: " + inspector.getCategory()); } } private static void writeTypes(OrcProto.Footer.Builder builder, TreeWriter treeWriter) { OrcProto.Type.Builder type = OrcProto.Type.newBuilder(); switch (treeWriter.inspector.getCategory()) { case PRIMITIVE: switch (((PrimitiveObjectInspector) treeWriter.inspector).getPrimitiveCategory()) { case BOOLEAN: type.setKind(OrcProto.Type.Kind.BOOLEAN); break; case BYTE: type.setKind(OrcProto.Type.Kind.BYTE); break; case SHORT: type.setKind(OrcProto.Type.Kind.SHORT); break; case INT: type.setKind(OrcProto.Type.Kind.INT); break; case LONG: type.setKind(OrcProto.Type.Kind.LONG); break; case FLOAT: type.setKind(OrcProto.Type.Kind.FLOAT); break; case DOUBLE: type.setKind(OrcProto.Type.Kind.DOUBLE); break; case STRING: type.setKind(OrcProto.Type.Kind.STRING); break; case CHAR: // The char length needs to be written to file and should be available // from the object inspector CharTypeInfo charTypeInfo = (CharTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector) .getTypeInfo(); type.setKind(Type.Kind.CHAR); type.setMaximumLength(charTypeInfo.getLength()); break; case VARCHAR: // The varchar length needs to be written to file and should be available // from the object inspector VarcharTypeInfo typeInfo = (VarcharTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector) .getTypeInfo(); type.setKind(Type.Kind.VARCHAR); type.setMaximumLength(typeInfo.getLength()); break; case BINARY: type.setKind(OrcProto.Type.Kind.BINARY); break; case TIMESTAMP: type.setKind(OrcProto.Type.Kind.TIMESTAMP); break; case DATE: type.setKind(OrcProto.Type.Kind.DATE); break; case DECIMAL: DecimalTypeInfo decTypeInfo = (DecimalTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector) .getTypeInfo(); type.setKind(OrcProto.Type.Kind.DECIMAL); type.setPrecision(decTypeInfo.precision()); type.setScale(decTypeInfo.scale()); break; default: throw new IllegalArgumentException("Unknown primitive category: " + ((PrimitiveObjectInspector) treeWriter.inspector).getPrimitiveCategory()); } break; case LIST: type.setKind(OrcProto.Type.Kind.LIST); type.addSubtypes(treeWriter.childrenWriters[0].id); break; case MAP: 
        type.setKind(OrcProto.Type.Kind.MAP);
        type.addSubtypes(treeWriter.childrenWriters[0].id);
        type.addSubtypes(treeWriter.childrenWriters[1].id);
        break;
      case STRUCT:
        type.setKind(OrcProto.Type.Kind.STRUCT);
        for (TreeWriter child : treeWriter.childrenWriters) {
          type.addSubtypes(child.id);
        }
        for (StructField field : ((StructTreeWriter) treeWriter).fields) {
          type.addFieldNames(field.getFieldName());
        }
        break;
      case UNION:
        type.setKind(OrcProto.Type.Kind.UNION);
        for (TreeWriter child : treeWriter.childrenWriters) {
          type.addSubtypes(child.id);
        }
        break;
      default:
        throw new IllegalArgumentException("Unknown category: " + treeWriter.inspector.getCategory());
    }
    builder.addTypes(type);
    for (TreeWriter child : treeWriter.childrenWriters) {
      writeTypes(builder, child);
    }
  }

  @VisibleForTesting
  FSDataOutputStream getStream() throws IOException {
    if (rawWriter == null) {
      rawWriter = fs.create(path, false, HDFS_BUFFER_SIZE, fs.getDefaultReplication(), blockSize);
      rawWriter.writeBytes(OrcFile.MAGIC);
      headerLength = rawWriter.getPos();
      writer = new OutStream("metadata", bufferSize, codec, new DirectStream(rawWriter));
      protobufWriter = CodedOutputStream.newInstance(writer);
    }
    return rawWriter;
  }

  private void createRowIndexEntry() throws IOException {
    treeWriter.createRowIndexEntry();
    rowsInIndex = 0;
  }

  private void flushStripe() throws IOException {
    getStream();
    if (buildIndex && rowsInIndex != 0) {
      createRowIndexEntry();
    }
    if (rowsInStripe != 0) {
      if (callback != null) {
        callback.preStripeWrite(callbackContext);
      }
      // finalize the data for the stripe
      int requiredIndexEntries = rowIndexStride == 0 ? 0 :
          (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
      OrcProto.StripeFooter.Builder builder = OrcProto.StripeFooter.newBuilder();
      treeWriter.writeStripe(builder, requiredIndexEntries);
      long indexSize = 0;
      long dataSize = 0;
      for (Map.Entry<StreamName, BufferedStream> pair : streams.entrySet()) {
        BufferedStream stream = pair.getValue();
        if (!stream.isSuppressed()) {
          stream.flush();
          StreamName name = pair.getKey();
          long streamSize = pair.getValue().getOutputSize();
          builder.addStreams(OrcProto.Stream.newBuilder().setColumn(name.getColumn())
              .setKind(name.getKind()).setLength(streamSize));
          if (StreamName.Area.INDEX == name.getArea()) {
            indexSize += streamSize;
          } else {
            dataSize += streamSize;
          }
        }
      }
      OrcProto.StripeFooter footer = builder.build();

      // Do we need to pad the file so the stripe doesn't straddle a block boundary?
      long start = rawWriter.getPos();
      final long currentStripeSize = indexSize + dataSize + footer.getSerializedSize();
      final long available = blockSize - (start % blockSize);
      final long overflow = currentStripeSize - adjustedStripeSize;
      final float availRatio = (float) available / (float) defaultStripeSize;

      if (availRatio > 0.0f && availRatio < 1.0f && availRatio > paddingTolerance) {
        // Adjust the default stripe size to fit into the remaining space, and also
        // adjust the next stripe for correction based on the current stripe size
        // and the user-specified padding tolerance. Since the stripe size can
        // overflow the default stripe size, we should apply this correction to
        // avoid writing a portion of the last stripe into the next HDFS block.
        float correction = overflow > 0 ? (float) overflow / (float) adjustedStripeSize : 0.0f;

        // the correction should not be greater than the user-specified padding tolerance
        correction = correction > paddingTolerance ?
            paddingTolerance : correction;

        // adjust the next stripe size based on the current stripe estimate correction
        adjustedStripeSize = (long) ((1.0f - correction) * (availRatio * defaultStripeSize));
      } else if (availRatio >= 1.0) {
        adjustedStripeSize = defaultStripeSize;
      }

      if (availRatio < paddingTolerance && addBlockPadding) {
        long padding = blockSize - (start % blockSize);
        byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, padding)];
        LOG.info(String.format("Padding ORC by %d bytes (<= %.2f * %d)",
            padding, availRatio, defaultStripeSize));
        start += padding;
        while (padding > 0) {
          int writeLen = (int) Math.min(padding, pad.length);
          rawWriter.write(pad, 0, writeLen);
          padding -= writeLen;
        }
        adjustedStripeSize = defaultStripeSize;
      } else if (currentStripeSize < blockSize
          && (start % blockSize) + currentStripeSize > blockSize) {
        // even if you don't pad, reset the default stripe size when crossing a block boundary
        adjustedStripeSize = defaultStripeSize;
      }

      // write out the data streams
      for (Map.Entry<StreamName, BufferedStream> pair : streams.entrySet()) {
        BufferedStream stream = pair.getValue();
        if (!stream.isSuppressed()) {
          stream.spillTo(rawWriter);
        }
        stream.clear();
      }
      footer.writeTo(protobufWriter);
      protobufWriter.flush();
      writer.flush();
      long footerLength = rawWriter.getPos() - start - dataSize - indexSize;
      OrcProto.StripeInformation dirEntry = OrcProto.StripeInformation.newBuilder()
          .setOffset(start).setNumberOfRows(rowsInStripe).setIndexLength(indexSize)
          .setDataLength(dataSize).setFooterLength(footerLength).build();
      stripes.add(dirEntry);
      rowCount += rowsInStripe;
      rowsInStripe = 0;
    }
  }

  private long computeRawDataSize() {
    long result = 0;
    for (TreeWriter child : treeWriter.getChildrenWriters()) {
      result += getRawDataSizeFromInspectors(child, child.inspector);
    }
    return result;
  }

  private long getRawDataSizeFromInspectors(TreeWriter child, ObjectInspector oi) {
    long total = 0;
    switch (oi.getCategory()) {
      case PRIMITIVE:
        total += getRawDataSizeFromPrimitives(child, oi);
        break;
      case LIST:
      case MAP:
      case UNION:
      case STRUCT:
        for (TreeWriter tw : child.childrenWriters) {
          total += getRawDataSizeFromInspectors(tw, tw.inspector);
        }
        break;
      default:
        LOG.debug("Unknown object inspector category.");
        break;
    }
    return total;
  }

  private long getRawDataSizeFromPrimitives(TreeWriter child, ObjectInspector oi) {
    long result = 0;
    long numVals = child.fileStatistics.getNumberOfValues();
    switch (((PrimitiveObjectInspector) oi).getPrimitiveCategory()) {
      case BOOLEAN:
      case BYTE:
      case SHORT:
      case INT:
      case FLOAT:
        return numVals * JavaDataModel.get().primitive1();
      case LONG:
      case DOUBLE:
        return numVals * JavaDataModel.get().primitive2();
      case STRING:
      case VARCHAR:
      case CHAR:
        // ORC strings are converted to java Strings, so use JavaDataModel to
        // compute the overall size of the strings.
        child = (StringTreeWriter) child;
        StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics;
        numVals = numVals == 0 ?
            1 : numVals;
        int avgStringLen = (int) (scs.getSum() / numVals);
        return numVals * JavaDataModel.get().lengthForStringOfLength(avgStringLen);
      case DECIMAL:
        return numVals * JavaDataModel.get().lengthOfDecimal();
      case DATE:
        return numVals * JavaDataModel.get().lengthOfDate();
      case BINARY:
        // get the total length of the binary blob
        BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics;
        return bcs.getSum();
      case TIMESTAMP:
        return numVals * JavaDataModel.get().lengthOfTimestamp();
      default:
        LOG.debug("Unknown primitive category.");
        break;
    }
    return result;
  }

  private OrcProto.CompressionKind writeCompressionKind(CompressionKind kind) {
    switch (kind) {
      case NONE:
        return OrcProto.CompressionKind.NONE;
      case ZLIB:
        return OrcProto.CompressionKind.ZLIB;
      case SNAPPY:
        return OrcProto.CompressionKind.SNAPPY;
      case LZO:
        return OrcProto.CompressionKind.LZO;
      default:
        throw new IllegalArgumentException("Unknown compression " + kind);
    }
  }

  private void writeFileStatistics(OrcProto.Footer.Builder builder, TreeWriter writer) throws IOException {
    builder.addStatistics(writer.fileStatistics.serialize());
    for (TreeWriter child : writer.getChildrenWriters()) {
      writeFileStatistics(builder, child);
    }
  }

  private int writeMetadata(long bodyLength) throws IOException {
    getStream();
    OrcProto.Metadata.Builder builder = OrcProto.Metadata.newBuilder();
    for (OrcProto.StripeStatistics.Builder ssb : treeWriter.stripeStatsBuilders) {
      builder.addStripeStats(ssb.build());
    }
    long startPosn = rawWriter.getPos();
    OrcProto.Metadata metadata = builder.build();
    metadata.writeTo(protobufWriter);
    protobufWriter.flush();
    writer.flush();
    return (int) (rawWriter.getPos() - startPosn);
  }

  private int writeFooter(long bodyLength) throws IOException {
    getStream();
    OrcProto.Footer.Builder builder = OrcProto.Footer.newBuilder();
    builder.setContentLength(bodyLength);
    builder.setHeaderLength(headerLength);
    builder.setNumberOfRows(rowCount);
    builder.setRowIndexStride(rowIndexStride);
    // populate the raw data size
    rawDataSize = computeRawDataSize();
    // serialize the types
    writeTypes(builder, treeWriter);
    // add the stripe information
    for (OrcProto.StripeInformation stripe : stripes) {
      builder.addStripes(stripe);
    }
    // add the column statistics
    writeFileStatistics(builder, treeWriter);
    // add all of the user metadata
    for (Map.Entry<String, ByteString> entry : userMetadata.entrySet()) {
      builder.addMetadata(OrcProto.UserMetadataItem.newBuilder()
          .setName(entry.getKey()).setValue(entry.getValue()));
    }
    long startPosn = rawWriter.getPos();
    OrcProto.Footer footer = builder.build();
    footer.writeTo(protobufWriter);
    protobufWriter.flush();
    writer.flush();
    return (int) (rawWriter.getPos() - startPosn);
  }

  private int writePostScript(int footerLength, int metadataLength) throws IOException {
    OrcProto.PostScript.Builder builder = OrcProto.PostScript.newBuilder()
        .setCompression(writeCompressionKind(compress)).setFooterLength(footerLength)
        .setMetadataLength(metadataLength).setMagic(OrcFile.MAGIC).addVersion(version.getMajor())
        .addVersion(version.getMinor()).setWriterVersion(OrcFile.WriterVersion.HIVE_8732.getId());
    if (compress != CompressionKind.NONE) {
      builder.setCompressionBlockSize(bufferSize);
    }
    OrcProto.PostScript ps = builder.build();
    // this needs to be written uncompressed
    long startPosn = rawWriter.getPos();
    ps.writeTo(rawWriter);
    long length = rawWriter.getPos() - startPosn;
    if (length > 255) {
      throw new IllegalArgumentException("PostScript too large at " + length);
    }
    return (int) length;
  }

  private long estimateStripeSize() {
    long result = 0;
    for (BufferedStream stream : streams.values()) {
      result += stream.getBufferSize();
    }
    result += treeWriter.estimateMemory();
    return result;
  }

  @Override
  public synchronized void addUserMetadata(String name, ByteBuffer value) {
    userMetadata.put(name, ByteString.copyFrom(value));
  }

  @Override
  public void addRow(Object row) throws IOException {
    synchronized (this) {
      treeWriter.write(row);
      rowsInStripe += 1;
      if (buildIndex) {
        rowsInIndex += 1;
        if (rowsInIndex >= rowIndexStride) {
          createRowIndexEntry();
        }
      }
    }
    memoryManager.addedRow();
  }

  @Override
  public void close() throws IOException {
    if (callback != null) {
      callback.preFooterWrite(callbackContext);
    }
    // remove us from the memory manager so that we don't get any callbacks
    memoryManager.removeWriter(path);
    // actually close the file
    synchronized (this) {
      flushStripe();
      int metadataLength = writeMetadata(rawWriter.getPos());
      int footerLength = writeFooter(rawWriter.getPos() - metadataLength);
      rawWriter.writeByte(writePostScript(footerLength, metadataLength));
      rawWriter.close();
    }
  }

  /**
   * The raw data size is computed while writing the file footer, so the value
   * is only available after the writer has been closed.
   */
  @Override
  public long getRawDataSize() {
    return rawDataSize;
  }

  /**
   * The row count is updated when flushing the stripes. To get an accurate row
   * count, call this method after the writer has been closed.
   */
  @Override
  public long getNumberOfRows() {
    return rowCount;
  }

  @Override
  public synchronized long writeIntermediateFooter() throws IOException {
    // flush any buffered rows
    flushStripe();
    // write a footer
    if (stripesAtLastFlush != stripes.size()) {
      if (callback != null) {
        callback.preFooterWrite(callbackContext);
      }
      int metaLength = writeMetadata(rawWriter.getPos());
      int footLength = writeFooter(rawWriter.getPos() - metaLength);
      rawWriter.writeByte(writePostScript(footLength, metaLength));
      stripesAtLastFlush = stripes.size();
      OrcInputFormat.SHIMS.hflush(rawWriter);
    }
    return rawWriter.getPos();
  }

  @Override
  public void appendStripe(byte[] stripe, int offset, int length,
      StripeInformation stripeInfo, OrcProto.StripeStatistics stripeStatistics) throws IOException {
    checkArgument(stripe != null, "Stripe must not be null");
    checkArgument(length <= stripe.length,
        "Specified length must not be greater than the specified array length");
    checkArgument(stripeInfo != null, "Stripe information must not be null");
    checkArgument(stripeStatistics != null, "Stripe statistics must not be null");
    getStream();
    long start = rawWriter.getPos();
    long stripeLen = length;
    long availBlockSpace = blockSize - (start % blockSize);
    // see if the stripe can fit in the current hdfs block, else pad the
    // remaining space in the block
    if (stripeLen < blockSize && stripeLen > availBlockSpace && addBlockPadding) {
      byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, availBlockSpace)];
      LOG.info(String.format("Padding ORC by %d bytes while merging...", availBlockSpace));
      start += availBlockSpace;
      while (availBlockSpace > 0) {
        int writeLen = (int) Math.min(availBlockSpace, pad.length);
        rawWriter.write(pad, 0, writeLen);
        availBlockSpace -= writeLen;
      }
    }
    // write only the requested slice of the stripe buffer
    rawWriter.write(stripe, offset, length);
    rowsInStripe = stripeStatistics.getColStats(0).getNumberOfValues();
    rowCount += rowsInStripe;
    // since we have already written the stripe, just update the stripe statistics
    treeWriter.stripeStatsBuilders.add(stripeStatistics.toBuilder());
    // update the file-level statistics
    updateFileStatistics(stripeStatistics);
    // update the stripe information
    OrcProto.StripeInformation dirEntry =
        OrcProto.StripeInformation.newBuilder().setOffset(start)
            .setNumberOfRows(rowsInStripe).setIndexLength(stripeInfo.getIndexLength())
            .setDataLength(stripeInfo.getDataLength())
            .setFooterLength(stripeInfo.getFooterLength()).build();
    stripes.add(dirEntry);

    // reset it after writing the stripe
    rowsInStripe = 0;
  }

  private void updateFileStatistics(OrcProto.StripeStatistics stripeStatistics) {
    List<OrcProto.ColumnStatistics> cs = stripeStatistics.getColStatsList();

    // root element
    treeWriter.fileStatistics.merge(ColumnStatisticsImpl.deserialize(cs.get(0)));

    TreeWriter[] childWriters = treeWriter.getChildrenWriters();
    for (int i = 0; i < childWriters.length; i++) {
      childWriters[i].fileStatistics.merge(ColumnStatisticsImpl.deserialize(cs.get(i + 1)));
    }
  }

  @Override
  public void appendUserMetadata(List<UserMetadataItem> userMetadata) {
    if (userMetadata != null) {
      for (UserMetadataItem item : userMetadata) {
        this.userMetadata.put(item.getName(), item.getValue());
      }
    }
  }
}
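With the class listing complete, a short sketch of how a caller typically drives this writer may help. This is a minimal example, assuming the com.blm.orc fork keeps the same factory surface as upstream Hive's OrcFile (createWriter, writerOptions, inspector, stripeSize, rowIndexStride, compress); the Row bean and output path are hypothetical, and the reflection-based object inspector comes from Hive's serde2 package.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;

import com.blm.orc.CompressionKind;
import com.blm.orc.OrcFile;
import com.blm.orc.Writer;

public class OrcWriteExample {
  // Hypothetical row bean; the reflection object inspector derives the ORC schema from it.
  public static class Row {
    public int id;
    public String name;
    public Row(int id, String name) { this.id = id; this.name = name; }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        Row.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);

    Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"),
        OrcFile.writerOptions(conf)
            .inspector(inspector)
            .stripeSize(64L * 1024 * 1024)    // flushStripe() is triggered around this size
            .rowIndexStride(10000)            // createRowIndexEntry() every 10,000 rows
            .compress(CompressionKind.ZLIB));

    for (int i = 0; i < 100; ++i) {
      writer.addRow(new Row(i, "row-" + i));  // buffered through the TreeWriter hierarchy
    }
    writer.close();                           // flushes the last stripe, then writes the
                                              // metadata, footer, and postscript shown above
  }
}

The addRow/close calls map directly onto the methods above: addRow feeds the root TreeWriter and the memory manager, and close drives flushStripe, writeMetadata, writeFooter, and writePostScript.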
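The block-padding arithmetic in flushStripe() is easier to follow with concrete numbers. The sketch below is not part of the writer; it restates the availRatio/correction formula with hypothetical names so you can plug in block and stripe sizes and see how the next stripe target shrinks near an HDFS block boundary (the separate padding branch that writes zero bytes is omitted).

public class StripePaddingMath {
  // Mirrors the size-adjustment arithmetic in flushStripe(): given where the writer
  // currently sits in the block, compute the target size for the next stripe.
  static long nextStripeSize(long blockSize, long start, long currentStripeSize,
                             long adjustedStripeSize, long defaultStripeSize,
                             float paddingTolerance) {
    final long available = blockSize - (start % blockSize);        // bytes left in this block
    final long overflow = currentStripeSize - adjustedStripeSize;  // how far the stripe overshot
    final float availRatio = (float) available / (float) defaultStripeSize;

    if (availRatio > 0.0f && availRatio < 1.0f && availRatio > paddingTolerance) {
      // shrink the next stripe to fit the remaining space, corrected by the overshoot
      float correction = overflow > 0 ? (float) overflow / (float) adjustedStripeSize : 0.0f;
      correction = Math.min(correction, paddingTolerance);
      return (long) ((1.0f - correction) * (availRatio * defaultStripeSize));
    } else if (availRatio >= 1.0f) {
      return defaultStripeSize;          // a whole default stripe still fits in the block
    }
    return adjustedStripeSize;           // below the tolerance: leave the target unchanged here
  }

  public static void main(String[] args) {
    // 256 MB block, writer is 200 MB into it, 64 MB default stripe, 60 MB just written:
    // 56 MB remain, availRatio = 0.875, overflow <= 0, so the next target becomes 56 MB.
    long mb = 1024L * 1024L;
    System.out.println(nextStripeSize(256 * mb, 200 * mb, 60 * mb, 64 * mb, 64 * mb, 0.05f));
  }
}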