Java tutorial: parquet.hadoop.ParquetInputSplit
/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package parquet.hadoop;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import parquet.Log;
import parquet.column.Encoding;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.ColumnPath;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.PrimitiveType.PrimitiveTypeName;

/**
 * An input split for the Parquet format.
 * It contains the information to read one block of the file.
 *
 * @author Julien Le Dem
 */
public class ParquetInputSplit extends FileSplit implements Writable {

  private static final Log LOG = Log.getLog(ParquetInputSplit.class);

  private List<BlockMetaData> blocks;
  private String requestedSchema;
  private String fileSchema;
  private Map<String, String> extraMetadata;
  private Map<String, String> readSupportMetadata;

  /**
   * Writables must have a parameterless constructor
   */
  public ParquetInputSplit() {
    super(null, 0, 0, new String[0]);
  }

  /**
   * Used by {@link ParquetInputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)}
   * @param path the path to the file
   * @param start the offset of the block in the file
   * @param length the size of the block in the file
   * @param hosts the hosts where this block can be found
   * @param blocks the block meta data (column locations)
   * @param requestedSchema the requested schema for materialization
   * @param fileSchema the schema of the file
   * @param extraMetadata the app specific meta data in the file
   * @param readSupportMetadata the read support specific metadata
   */
  public ParquetInputSplit(
      Path path,
      long start,
      long length,
      String[] hosts,
      List<BlockMetaData> blocks,
      String requestedSchema,
      String fileSchema,
      Map<String, String> extraMetadata,
      Map<String, String> readSupportMetadata) {
    super(path, start, length, hosts);
    this.blocks = blocks;
    this.requestedSchema = requestedSchema;
    this.fileSchema = fileSchema;
    this.extraMetadata = extraMetadata;
    this.readSupportMetadata = readSupportMetadata;
  }

  /**
   * @return the block meta data
   */
  public List<BlockMetaData> getBlocks() {
    return blocks;
  }

  /**
   * @return the requested schema
   */
  public String getRequestedSchema() {
    return requestedSchema;
  }

  /**
   * @return the file schema
   */
  public String getFileSchema() {
    return fileSchema;
  }

  /**
   * @return app specific metadata from the file
   */
  public Map<String, String> getExtraMetadata() {
    return extraMetadata;
  }

  /**
   * @return app specific metadata provided by the read support in the init phase
   */
  public Map<String, String> getReadSupportMetadata() {
    return readSupportMetadata;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    int blocksSize = in.readInt();
    this.blocks = new ArrayList<BlockMetaData>(blocksSize);
    for (int i = 0; i < blocksSize; i++) {
      blocks.add(readBlock(in));
    }
    this.requestedSchema = decompressString(Text.readString(in));
    this.fileSchema = decompressString(Text.readString(in));
    this.extraMetadata = readKeyValues(in);
    this.readSupportMetadata = readKeyValues(in);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    out.writeInt(blocks.size());
    for (BlockMetaData block : blocks) {
      writeBlock(out, block);
    }
    Text.writeString(out, compressString(requestedSchema));
    Text.writeString(out, compressString(fileSchema));
    writeKeyValues(out, extraMetadata);
    writeKeyValues(out, readSupportMetadata);
  }

  String compressString(String str) {
    ByteArrayOutputStream obj = new ByteArrayOutputStream();
    GZIPOutputStream gzip;
    try {
      gzip = new GZIPOutputStream(obj);
      gzip.write(str.getBytes("UTF-8"));
      gzip.close();
    } catch (IOException e) {
      // Not really sure how we can get here. I guess the best thing to do is to croak.
      LOG.error("Unable to gzip InputSplit string " + str, e);
      throw new RuntimeException("Unable to gzip InputSplit string", e);
    }
    String compressedStr = Base64.encodeBase64String(obj.toByteArray());
    return compressedStr;
  }

  String decompressString(String str) {
    byte[] decoded = Base64.decodeBase64(str);
    ByteArrayInputStream obj = new ByteArrayInputStream(decoded);
    GZIPInputStream gzip = null;
    String outStr = "";
    try {
      gzip = new GZIPInputStream(obj);
      BufferedReader reader = new BufferedReader(new InputStreamReader(gzip, "UTF-8"));
      char[] buffer = new char[1024];
      int n = 0;
      StringBuilder sb = new StringBuilder();
      while (-1 != (n = reader.read(buffer))) {
        sb.append(buffer, 0, n);
      }
      outStr = sb.toString();
    } catch (IOException e) {
      // Not really sure how we can get here. I guess the best thing to do is to croak.
      LOG.error("Unable to uncompress InputSplit string " + str, e);
      throw new RuntimeException("Unable to uncompress InputSplit String", e);
    } finally {
      if (null != gzip) {
        try {
          gzip.close();
        } catch (IOException e) {
          LOG.error("Unable to uncompress InputSplit string " + str, e);
          throw new RuntimeException("Unable to uncompress InputSplit String", e);
        }
      }
    }
    return outStr;
  }

  private BlockMetaData readBlock(DataInput in) throws IOException {
    final BlockMetaData block = new BlockMetaData();
    int size = in.readInt();
    for (int i = 0; i < size; i++) {
      block.addColumn(readColumn(in));
    }
    block.setRowCount(in.readLong());
    block.setTotalByteSize(in.readLong());
    if (!in.readBoolean()) {
      block.setPath(in.readUTF().intern());
    }
    return block;
  }

  private void writeBlock(DataOutput out, BlockMetaData block) throws IOException {
    out.writeInt(block.getColumns().size());
    for (ColumnChunkMetaData column : block.getColumns()) {
      writeColumn(out, column);
    }
    out.writeLong(block.getRowCount());
    out.writeLong(block.getTotalByteSize());
    out.writeBoolean(block.getPath() == null);
    if (block.getPath() != null) {
      out.writeUTF(block.getPath());
    }
  }

  private ColumnChunkMetaData readColumn(DataInput in) throws IOException {
    CompressionCodecName codec = CompressionCodecName.values()[in.readInt()];
    String[] columnPath = new String[in.readInt()];
    for (int i = 0; i < columnPath.length; i++) {
      columnPath[i] = in.readUTF().intern();
    }
    PrimitiveTypeName type = PrimitiveTypeName.values()[in.readInt()];
    int encodingsSize = in.readInt();
    Set<Encoding> encodings = new HashSet<Encoding>(encodingsSize);
    for (int i = 0; i < encodingsSize; i++) {
      encodings.add(Encoding.values()[in.readInt()]);
    }
    ColumnChunkMetaData column = ColumnChunkMetaData.get(
        ColumnPath.get(columnPath), type, codec, encodings,
        in.readLong(), in.readLong(), in.readLong(), in.readLong(), in.readLong());
    return column;
  }

  private void writeColumn(DataOutput out, ColumnChunkMetaData column) throws IOException {
    out.writeInt(column.getCodec().ordinal());
    out.writeInt(column.getPath().size());
    for (String s : column.getPath()) {
      out.writeUTF(s);
    }
    out.writeInt(column.getType().ordinal());
    out.writeInt(column.getEncodings().size());
    for (Encoding encoding : column.getEncodings()) {
      out.writeInt(encoding.ordinal());
    }
    out.writeLong(column.getFirstDataPageOffset());
    out.writeLong(column.getDictionaryPageOffset());
    out.writeLong(column.getValueCount());
    out.writeLong(column.getTotalSize());
    out.writeLong(column.getTotalUncompressedSize());
  }

  private Map<String, String> readKeyValues(DataInput in) throws IOException {
    int size = in.readInt();
    Map<String, String> map = new HashMap<String, String>(size);
    for (int i = 0; i < size; i++) {
      String key = in.readUTF().intern();
      String value = in.readUTF().intern();
      map.put(key, value);
    }
    return map;
  }

  private void writeKeyValues(DataOutput out, Map<String, String> map) throws IOException {
    if (map == null) {
      out.writeInt(0);
    } else {
      out.writeInt(map.size());
      for (Entry<String, String> entry : map.entrySet()) {
        out.writeUTF(entry.getKey());
        out.writeUTF(entry.getValue());
      }
    }
  }

  @Override
  public String toString() {
    String hosts[] = {};
    try {
      hosts = getLocations();
    } catch (Exception ignore) {} // IOException/InterruptedException could be thrown
    return this.getClass().getSimpleName() + "{"
        + "part: " + getPath()
        + " start: " + getStart()
        + " length: " + getLength()
        + " hosts: " + Arrays.toString(hosts)
        + " blocks: " + blocks.size()
        + " requestedSchema: " + (fileSchema.equals(requestedSchema) ? "same as file" : requestedSchema)
        + " fileSchema: " + fileSchema
        + " extraMetadata: " + extraMetadata
        + " readSupportMetadata: " + readSupportMetadata
        + "}";
  }
}