// Java tutorial
/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package io.prestosql.orc.checkpoint; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSetMultimap; import com.google.common.collect.Iterables; import com.google.common.collect.SetMultimap; import io.prestosql.orc.StreamId; import io.prestosql.orc.metadata.ColumnEncoding; import io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind; import io.prestosql.orc.metadata.OrcType; import io.prestosql.orc.metadata.OrcType.OrcTypeKind; import io.prestosql.orc.metadata.RowGroupIndex; import io.prestosql.orc.metadata.Stream; import io.prestosql.orc.metadata.Stream.StreamKind; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; import static com.google.common.base.Predicates.equalTo; import static io.prestosql.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT_V2; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DWRF_DIRECT; import static io.prestosql.orc.metadata.Stream.StreamKind.DATA; import static 
io.prestosql.orc.metadata.Stream.StreamKind.DICTIONARY_DATA; import static io.prestosql.orc.metadata.Stream.StreamKind.IN_DICTIONARY; import static io.prestosql.orc.metadata.Stream.StreamKind.IN_MAP; import static io.prestosql.orc.metadata.Stream.StreamKind.LENGTH; import static io.prestosql.orc.metadata.Stream.StreamKind.PRESENT; import static io.prestosql.orc.metadata.Stream.StreamKind.ROW_GROUP_DICTIONARY; import static io.prestosql.orc.metadata.Stream.StreamKind.ROW_GROUP_DICTIONARY_LENGTH; import static io.prestosql.orc.metadata.Stream.StreamKind.SECONDARY; import static java.lang.String.format; import static java.util.Objects.requireNonNull; public final class Checkpoints { private Checkpoints() { } public static Map<StreamId, StreamCheckpoint> getStreamCheckpoints(Set<Integer> columns, List<OrcType> columnTypes, boolean compressed, int rowGroupId, List<ColumnEncoding> columnEncodings, Map<StreamId, Stream> streams, Map<StreamId, List<RowGroupIndex>> columnIndexes) throws InvalidCheckpointException { // map from (column, sequence) to available StreamKind ImmutableSetMultimap.Builder<ColumnAndSequence, StreamKind> streamKindsBuilder = ImmutableSetMultimap .builder(); for (Stream stream : streams.values()) { streamKindsBuilder.put(new ColumnAndSequence(stream.getColumn(), stream.getSequence()), stream.getStreamKind()); } SetMultimap<ColumnAndSequence, StreamKind> streamKinds = streamKindsBuilder.build(); ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); for (Map.Entry<StreamId, List<RowGroupIndex>> entry : columnIndexes.entrySet()) { int column = entry.getKey().getColumn(); if (!columns.contains(column)) { continue; } int sequence = entry.getKey().getSequence(); List<Integer> positionsList = entry.getValue().get(rowGroupId).getPositions(); ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncoding(sequence) .getColumnEncodingKind(); OrcTypeKind columnType = columnTypes.get(column).getOrcTypeKind(); 
Set<StreamKind> availableStreams = streamKinds.get(new ColumnAndSequence(column, sequence)); ColumnPositionsList columnPositionsList = new ColumnPositionsList(column, sequence, columnType, positionsList); switch (columnType) { case BOOLEAN: checkpoints.putAll(getBooleanColumnCheckpoints(column, sequence, compressed, availableStreams, columnPositionsList)); break; case BYTE: checkpoints.putAll(getByteColumnCheckpoints(column, sequence, compressed, availableStreams, columnPositionsList)); break; case SHORT: case INT: case LONG: case DATE: checkpoints.putAll(getLongColumnCheckpoints(column, sequence, columnEncoding, compressed, availableStreams, columnPositionsList)); break; case FLOAT: checkpoints.putAll(getFloatColumnCheckpoints(column, sequence, compressed, availableStreams, columnPositionsList)); break; case DOUBLE: checkpoints.putAll(getDoubleColumnCheckpoints(column, sequence, compressed, availableStreams, columnPositionsList)); break; case TIMESTAMP: checkpoints.putAll(getTimestampColumnCheckpoints(column, sequence, columnEncoding, compressed, availableStreams, columnPositionsList)); break; case BINARY: case STRING: case VARCHAR: case CHAR: checkpoints.putAll(getSliceColumnCheckpoints(column, sequence, columnEncoding, compressed, availableStreams, columnPositionsList)); break; case LIST: case MAP: checkpoints.putAll(getListOrMapColumnCheckpoints(column, sequence, columnEncoding, compressed, availableStreams, columnPositionsList)); break; case STRUCT: checkpoints.putAll(getStructColumnCheckpoints(column, sequence, compressed, availableStreams, columnPositionsList)); break; case DECIMAL: checkpoints.putAll(getDecimalColumnCheckpoints(column, sequence, columnEncoding, compressed, availableStreams, columnPositionsList)); break; default: throw new IllegalArgumentException("Unsupported column type " + columnType); } // The DWRF code is not meticulous in the handling of checkpoints. 
It appears that for the first row group // it will write checkpoints for all streams, but in other cases it will write only the streams that exist. // We detect this case by checking that all offsets in the initial position list are zero, and if so, we // clear the extra offsets if (columnPositionsList.hasNextPosition() && !Iterables.all(positionsList, equalTo(0))) { throw new InvalidCheckpointException(format( "Column %s, of type %s, contains %s offset positions, but only %s positions were consumed", column, columnType, positionsList.size(), columnPositionsList.getIndex())); } } return checkpoints.build(); } public static StreamCheckpoint getDictionaryStreamCheckpoint(StreamId streamId, OrcTypeKind columnType, ColumnEncodingKind columnEncoding) { if (streamId.getStreamKind() == DICTIONARY_DATA) { switch (columnType) { case SHORT: case INT: case LONG: return new LongStreamDwrfCheckpoint(createInputStreamCheckpoint(0, 0)); case STRING: case VARCHAR: case CHAR: case BINARY: return new ByteArrayStreamCheckpoint(createInputStreamCheckpoint(0, 0)); } } // dictionary length and data streams are unsigned long streams if (streamId.getStreamKind() == LENGTH || streamId.getStreamKind() == DATA) { if (columnEncoding == DICTIONARY_V2) { return new LongStreamV2Checkpoint(0, createInputStreamCheckpoint(0, 0)); } else if (columnEncoding == DICTIONARY) { return new LongStreamV1Checkpoint(0, createInputStreamCheckpoint(0, 0)); } } throw new IllegalArgumentException( "Unsupported column type " + columnType + " for dictionary stream " + streamId); } private static Map<StreamId, StreamCheckpoint> getBooleanColumnCheckpoints(int column, int sequence, boolean compressed, Set<StreamKind> availableStreams, ColumnPositionsList positionsList) { ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); if (availableStreams.contains(IN_MAP)) { checkpoints.put(new StreamId(column, sequence, IN_MAP), new BooleanStreamCheckpoint(compressed, positionsList)); } if 
(availableStreams.contains(PRESENT)) { checkpoints.put(new StreamId(column, sequence, PRESENT), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(DATA)) { checkpoints.put(new StreamId(column, sequence, DATA), new BooleanStreamCheckpoint(compressed, positionsList)); } return checkpoints.build(); } private static Map<StreamId, StreamCheckpoint> getByteColumnCheckpoints(int column, int sequence, boolean compressed, Set<StreamKind> availableStreams, ColumnPositionsList positionsList) { ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); if (availableStreams.contains(IN_MAP)) { checkpoints.put(new StreamId(column, sequence, IN_MAP), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(PRESENT)) { checkpoints.put(new StreamId(column, sequence, PRESENT), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(DATA)) { checkpoints.put(new StreamId(column, sequence, DATA), new ByteStreamCheckpoint(compressed, positionsList)); } return checkpoints.build(); } private static Map<StreamId, StreamCheckpoint> getLongColumnCheckpoints(int column, int sequence, ColumnEncodingKind encoding, boolean compressed, Set<StreamKind> availableStreams, ColumnPositionsList positionsList) { ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); if (availableStreams.contains(IN_MAP)) { checkpoints.put(new StreamId(column, sequence, IN_MAP), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(PRESENT)) { checkpoints.put(new StreamId(column, sequence, PRESENT), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(IN_DICTIONARY)) { checkpoints.put(new StreamId(column, sequence, IN_DICTIONARY), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(DATA)) { checkpoints.put(new StreamId(column, sequence, DATA), 
createLongStreamCheckpoint(encoding, compressed, positionsList)); } return checkpoints.build(); } private static Map<StreamId, StreamCheckpoint> getFloatColumnCheckpoints(int column, int sequence, boolean compressed, Set<StreamKind> availableStreams, ColumnPositionsList positionsList) { ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); if (availableStreams.contains(IN_MAP)) { checkpoints.put(new StreamId(column, sequence, IN_MAP), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(PRESENT)) { checkpoints.put(new StreamId(column, sequence, PRESENT), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(DATA)) { checkpoints.put(new StreamId(column, sequence, DATA), new FloatStreamCheckpoint(compressed, positionsList)); } return checkpoints.build(); } private static Map<StreamId, StreamCheckpoint> getDoubleColumnCheckpoints(int column, int sequence, boolean compressed, Set<StreamKind> availableStreams, ColumnPositionsList positionsList) { ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); if (availableStreams.contains(IN_MAP)) { checkpoints.put(new StreamId(column, sequence, IN_MAP), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(PRESENT)) { checkpoints.put(new StreamId(column, sequence, PRESENT), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(DATA)) { checkpoints.put(new StreamId(column, sequence, DATA), new DoubleStreamCheckpoint(compressed, positionsList)); } return checkpoints.build(); } private static Map<StreamId, StreamCheckpoint> getTimestampColumnCheckpoints(int column, int sequence, ColumnEncodingKind encoding, boolean compressed, Set<StreamKind> availableStreams, ColumnPositionsList positionsList) { ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); if (availableStreams.contains(IN_MAP)) { 
checkpoints.put(new StreamId(column, sequence, IN_MAP), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(PRESENT)) { checkpoints.put(new StreamId(column, sequence, PRESENT), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(DATA)) { checkpoints.put(new StreamId(column, sequence, DATA), createLongStreamCheckpoint(encoding, compressed, positionsList)); } if (availableStreams.contains(SECONDARY)) { checkpoints.put(new StreamId(column, sequence, SECONDARY), createLongStreamCheckpoint(encoding, compressed, positionsList)); } return checkpoints.build(); } private static Map<StreamId, StreamCheckpoint> getSliceColumnCheckpoints(int column, int sequence, ColumnEncodingKind encoding, boolean compressed, Set<StreamKind> availableStreams, ColumnPositionsList positionsList) { ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); if (availableStreams.contains(IN_MAP)) { checkpoints.put(new StreamId(column, sequence, IN_MAP), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(PRESENT)) { checkpoints.put(new StreamId(column, sequence, PRESENT), new BooleanStreamCheckpoint(compressed, positionsList)); } if (encoding == DIRECT || encoding == DIRECT_V2) { if (availableStreams.contains(DATA)) { checkpoints.put(new StreamId(column, sequence, DATA), new ByteArrayStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(LENGTH)) { checkpoints.put(new StreamId(column, sequence, LENGTH), createLongStreamCheckpoint(encoding, compressed, positionsList)); } } else if (encoding == DICTIONARY || encoding == DICTIONARY_V2) { // DWRF has rules inconsistent with the ORC style if (availableStreams.contains(IN_DICTIONARY)) { if (availableStreams.contains(ROW_GROUP_DICTIONARY)) { checkpoints.put(new StreamId(column, sequence, ROW_GROUP_DICTIONARY), new ByteArrayStreamCheckpoint(compressed, positionsList)); } checkpoints.put(new 
StreamId(column, sequence, ROW_GROUP_DICTIONARY_LENGTH), new RowGroupDictionaryLengthStreamCheckpoint(compressed, positionsList)); if (availableStreams.contains(DATA)) { checkpoints.put(new StreamId(column, sequence, DATA), createLongStreamCheckpoint(encoding, compressed, positionsList)); } checkpoints.put(new StreamId(column, sequence, IN_DICTIONARY), new BooleanStreamCheckpoint(compressed, positionsList)); } else { if (availableStreams.contains(DATA)) { checkpoints.put(new StreamId(column, sequence, DATA), createLongStreamCheckpoint(encoding, compressed, positionsList)); } } } else { throw new IllegalArgumentException("Unsupported encoding for slice column: " + encoding); } return checkpoints.build(); } private static Map<StreamId, StreamCheckpoint> getListOrMapColumnCheckpoints(int column, int sequence, ColumnEncodingKind encoding, boolean compressed, Set<StreamKind> availableStreams, ColumnPositionsList positionsList) { ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); if (availableStreams.contains(IN_MAP)) { checkpoints.put(new StreamId(column, sequence, IN_MAP), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(PRESENT)) { checkpoints.put(new StreamId(column, sequence, PRESENT), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(LENGTH)) { checkpoints.put(new StreamId(column, sequence, LENGTH), createLongStreamCheckpoint(encoding, compressed, positionsList)); } return checkpoints.build(); } private static Map<StreamId, StreamCheckpoint> getStructColumnCheckpoints(int column, int sequence, boolean compressed, Set<StreamKind> availableStreams, ColumnPositionsList positionsList) { ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); if (availableStreams.contains(IN_MAP)) { checkpoints.put(new StreamId(column, sequence, IN_MAP), new BooleanStreamCheckpoint(compressed, positionsList)); } if 
(availableStreams.contains(PRESENT)) { checkpoints.put(new StreamId(column, sequence, PRESENT), new BooleanStreamCheckpoint(compressed, positionsList)); } return checkpoints.build(); } private static Map<StreamId, StreamCheckpoint> getDecimalColumnCheckpoints(int column, int sequence, ColumnEncodingKind encoding, boolean compressed, Set<StreamKind> availableStreams, ColumnPositionsList positionsList) { ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder(); if (availableStreams.contains(IN_MAP)) { checkpoints.put(new StreamId(column, sequence, IN_MAP), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(PRESENT)) { checkpoints.put(new StreamId(column, sequence, PRESENT), new BooleanStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(DATA)) { checkpoints.put(new StreamId(column, sequence, DATA), new DecimalStreamCheckpoint(compressed, positionsList)); } if (availableStreams.contains(SECONDARY)) { checkpoints.put(new StreamId(column, sequence, SECONDARY), createLongStreamCheckpoint(encoding, compressed, positionsList)); } return checkpoints.build(); } private static StreamCheckpoint createLongStreamCheckpoint(ColumnEncodingKind encoding, boolean compressed, ColumnPositionsList positionsList) { if (encoding == DIRECT_V2 || encoding == DICTIONARY_V2) { return new LongStreamV2Checkpoint(compressed, positionsList); } if (encoding == DIRECT || encoding == DICTIONARY) { return new LongStreamV1Checkpoint(compressed, positionsList); } if (encoding == DWRF_DIRECT) { return new LongStreamDwrfCheckpoint(compressed, positionsList); } throw new IllegalArgumentException("Unsupported encoding for long stream: " + encoding); } public static class ColumnPositionsList { private final int column; private final int sequence; private final OrcTypeKind columnType; private final List<Integer> positionsList; private int index; private ColumnPositionsList(int column, int sequence, OrcTypeKind 
columnType, List<Integer> positionsList) { this.column = column; this.sequence = sequence; this.columnType = requireNonNull(columnType, "columnType is null"); this.positionsList = ImmutableList.copyOf(requireNonNull(positionsList, "positionsList is null")); } public int getIndex() { return index; } public boolean hasNextPosition() { return index < positionsList.size(); } public int nextPosition() { if (!hasNextPosition()) { throw new InvalidCheckpointException( "Not enough positions for column %s and sequence %s, of type %s, checkpoints", column, sequence, columnType); } return positionsList.get(index++); } } private static class ColumnAndSequence { private final int column; private final int sequence; private ColumnAndSequence(int column, int sequence) { this.column = column; this.sequence = sequence; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } ColumnAndSequence that = (ColumnAndSequence) o; return column == that.column && sequence == that.sequence; } @Override public int hashCode() { return Objects.hash(column, sequence); } } }