Java tutorial: dumping Parquet file metadata and column data with parquet-tools' DumpCommand
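The class below is the DumpCommand from Apache Parquet's parquet-tools module (package org.apache.parquet.tools.command). It implements the tool's dump behavior: it reads a Parquet file's footer, prints row-group and page metadata unless -m/--disable-meta is given, prints the decoded column values unless -d/--disable-data is given, can be restricted to specific columns with -c/--column, and crops output to the console width unless -n/--disable-crop is set.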
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.tools.command;

import java.io.IOException;
import java.math.BigInteger;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPage.Visitor;
import org.apache.parquet.column.page.DataPageV1;
import org.apache.parquet.column.page.DataPageV2;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveStringifier;
import org.apache.parquet.tools.util.MetadataUtils;
import org.apache.parquet.tools.util.PrettyPrintWriter;
import org.apache.parquet.tools.util.PrettyPrintWriter.WhiteSpaceHandler;

import com.google.common.base.Joiner;

import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;

public class DumpCommand extends ArgsOnlyCommand {
    private static final Charset UTF8 = Charset.forName("UTF-8");
    private static final CharsetDecoder UTF8_DECODER = UTF8.newDecoder();

    public static final String TABS = " ";
    public static final int BLOCK_BUFFER_SIZE = 64 * 1024;
    public static final String[] USAGE = new String[] {
            "<input>",
            "where <input> is the parquet file to print to stdout"
    };

    // CLI flags: -m/--disable-meta, -d/--disable-data, -n/--disable-crop, -c/--column
    public static final Options OPTIONS;
    static {
        OPTIONS = new Options();
        Option md = OptionBuilder.withLongOpt("disable-meta")
                .withDescription("Do not dump row group and page metadata").create('m');
        Option dt = OptionBuilder.withLongOpt("disable-data")
                .withDescription("Do not dump column data").create('d');
        Option nocrop = OptionBuilder.withLongOpt("disable-crop")
                .withDescription("Do not crop the output based on console width").create('n');
        Option cl = OptionBuilder.withLongOpt("column")
                .withDescription("Dump only the given column, can be specified more than once").hasArg()
                .create('c');
        OPTIONS.addOption(md);
        OPTIONS.addOption(dt);
        OPTIONS.addOption(nocrop);
        OPTIONS.addOption(cl);
    }

    public DumpCommand() {
        super(1, 1);
    }

    @Override
    public Options getOptions() {
        return OPTIONS;
    }

    @Override
    public String[] getUsageDescription() {
        return USAGE;
    }

    @Override
    public String getCommandDescription() {
        return "Prints the content and metadata of a Parquet file";
    }

    // Parses the command line, reads the file footer, and delegates to dump().
    @Override
    public void execute(CommandLine options) throws Exception {
        super.execute(options);

        String[] args = options.getArgs();
        String input = args[0];

        Configuration conf = new Configuration();
        Path inpath = new Path(input);

        ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
        MessageType schema = metaData.getFileMetaData().getSchema();

        boolean showmd = !options.hasOption('m');
        boolean showdt = !options.hasOption('d');
        boolean cropoutput = !options.hasOption('n');

        Set<String> showColumns = null;
        if (options.hasOption('c')) {
            String[] cols = options.getOptionValues('c');
            showColumns = new HashSet<String>(Arrays.asList(cols));
        }

        PrettyPrintWriter out = prettyPrintWriter(cropoutput);
        dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
    }

    // Dumps row group and page metadata (showmd) and column data (showdt),
    // optionally limited to the columns named in showColumns.
    public static void dump(PrettyPrintWriter out, ParquetMetadata meta, MessageType schema,
            Path inpath, boolean showmd, boolean showdt, Set<String> showColumns) throws IOException {
        Configuration conf = new Configuration();

        List<BlockMetaData> blocks = meta.getBlocks();
        List<ColumnDescriptor> columns = schema.getColumns();
        if (showColumns != null) {
            columns = new ArrayList<ColumnDescriptor>();
            for (ColumnDescriptor column : schema.getColumns()) {
                String path = Joiner.on('.').skipNulls().join(column.getPath());
                if (showColumns.contains(path)) {
                    columns.add(column);
                }
            }
        }

        ParquetFileReader freader = null;
        if (showmd) {
            try {
                long group = 0;
                for (BlockMetaData block : blocks) {
                    if (group != 0) out.println();
                    out.format("row group %d%n", group++);
                    out.rule('-');

                    List<ColumnChunkMetaData> ccmds = block.getColumns();
                    if (showColumns != null) {
                        ccmds = new ArrayList<ColumnChunkMetaData>();
                        for (ColumnChunkMetaData ccmd : block.getColumns()) {
                            String path = Joiner.on('.').skipNulls().join(ccmd.getPath().toArray());
                            if (showColumns.contains(path)) {
                                ccmds.add(ccmd);
                            }
                        }
                    }

                    MetadataUtils.showDetails(out, ccmds);

                    List<BlockMetaData> rblocks = Collections.singletonList(block);
                    freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, rblocks, columns);
                    PageReadStore store = freader.readNextRowGroup();
                    while (store != null) {
                        out.incrementTabLevel();
                        for (ColumnDescriptor column : columns) {
                            out.println();
                            dump(out, store, column);
                        }
                        out.decrementTabLevel();
                        store = freader.readNextRowGroup();
                    }
                    out.flushColumns();
                }
            } finally {
                if (freader != null) {
                    freader.close();
                }
            }
        }

        if (showdt) {
            boolean first = true;
            for (ColumnDescriptor column : columns) {
                if (!first || showmd) out.println();
                first = false;

                out.format("%s %s%n", column.getType(), Joiner.on('.').skipNulls().join(column.getPath()));
                out.rule('-');
                try {
                    long page = 1;
                    long total = blocks.size();
                    long offset = 1;
                    freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, blocks,
                            Collections.singletonList(column));
                    PageReadStore store = freader.readNextRowGroup();
                    while (store != null) {
                        ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store,
                                new DumpGroupConverter(), schema, meta.getFileMetaData().getCreatedBy());
                        dump(out, crstore, column, page++, total, offset);

                        offset += store.getRowCount();
                        store = freader.readNextRowGroup();
                    }
                    out.flushColumns();
                } finally {
                    out.flushColumns();
                    if (freader != null) {
                        freader.close();
                    }
                }
            }
        }
    }

    // Prints per-page metadata (encodings, statistics, sizes) for one column chunk.
    public static void dump(final PrettyPrintWriter out, PageReadStore store, ColumnDescriptor column)
            throws IOException {
        PageReader reader = store.getPageReader(column);

        long vc = reader.getTotalValueCount();
        int rmax = column.getMaxRepetitionLevel();
        int dmax = column.getMaxDefinitionLevel();
        out.format("%s TV=%d RL=%d DL=%d", Joiner.on('.').skipNulls().join(column.getPath()), vc, rmax, dmax);

        DictionaryPage dict = reader.readDictionaryPage();
        if (dict != null) {
            out.format(" DS:%d", dict.getDictionarySize());
            out.format(" DE:%s", dict.getEncoding());
        }

        out.println();
        out.rule('-');

        DataPage page = reader.readPage();
        for (long count = 0; page != null; count++) {
            out.format("page %d:", count);
            page.accept(new Visitor<Void>() {
                @Override
                public Void visit(DataPageV1 pageV1) {
                    out.format(" DLE:%s", pageV1.getDlEncoding());
                    out.format(" RLE:%s", pageV1.getRlEncoding());
                    out.format(" VLE:%s", pageV1.getValueEncoding());
                    Statistics<?> statistics = pageV1.getStatistics();
                    if (statistics != null) {
                        out.format(" ST:[%s]", statistics);
                    } else {
                        out.format(" ST:[none]");
                    }
                    return null;
                }

                @Override
                public Void visit(DataPageV2 pageV2) {
                    out.format(" DLE:RLE");
                    out.format(" RLE:RLE");
                    out.format(" VLE:%s", pageV2.getDataEncoding());
                    Statistics<?> statistics = pageV2.getStatistics();
                    if (statistics != null) {
                        out.format(" ST:[%s]", statistics);
                    } else {
                        out.format(" ST:[none]");
                    }
                    return null;
                }
            });
            out.format(" SZ:%d", page.getUncompressedSize());
            out.format(" VC:%d", page.getValueCount());
            out.println();
            page = reader.readPage();
        }
    }

    // Prints every value of one column in one row group, with its repetition and definition levels.
    public static void dump(PrettyPrintWriter out, ColumnReadStoreImpl crstore, ColumnDescriptor column,
            long page, long total, long offset) throws IOException {
        int dmax = column.getMaxDefinitionLevel();
        ColumnReader creader = crstore.getColumnReader(column);
        out.format("*** row group %d of %d, values %d to %d ***%n", page, total, offset,
                offset + creader.getTotalValueCount() - 1);

        for (long i = 0, e = creader.getTotalValueCount(); i < e; ++i) {
            int rlvl = creader.getCurrentRepetitionLevel();
            int dlvl = creader.getCurrentDefinitionLevel();

            out.format("value %d: R:%d D:%d V:", offset + i, rlvl, dlvl);
            if (dlvl == dmax) {
                PrimitiveStringifier stringifier = column.getPrimitiveType().stringifier();
                switch (column.getType()) {
                    case FIXED_LEN_BYTE_ARRAY:
                    case INT96:
                    case BINARY:
                        out.print(stringifier.stringify(creader.getBinary()));
                        break;
                    case BOOLEAN:
                        out.print(stringifier.stringify(creader.getBoolean()));
                        break;
                    case DOUBLE:
                        out.print(stringifier.stringify(creader.getDouble()));
                        break;
                    case FLOAT:
                        out.print(stringifier.stringify(creader.getFloat()));
                        break;
                    case INT32:
                        out.print(stringifier.stringify(creader.getInteger()));
                        break;
                    case INT64:
                        out.print(stringifier.stringify(creader.getLong()));
                        break;
                }
            } else {
                out.format("<null>");
            }

            out.println();
            creader.consume();
        }
    }

    // Best-effort UTF-8 decoding of a Binary value; falls back to a placeholder on failure.
    public static String binaryToString(Binary value) {
        byte[] data = value.getBytesUnsafe();
        if (data == null) return null;

        try {
            CharBuffer buffer = UTF8_DECODER.decode(value.toByteBuffer());
            return buffer.toString();
        } catch (Exception ex) {
        }

        return "<bytes...>";
    }

    public static BigInteger binaryToBigInteger(Binary value) {
        byte[] data = value.getBytesUnsafe();
        if (data == null) return null;

        return new BigInteger(data);
    }

    private static PrettyPrintWriter prettyPrintWriter(boolean cropOutput) {
        PrettyPrintWriter.Builder builder = PrettyPrintWriter.stdoutPrettyPrinter()
                .withAutoColumn()
                .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
                .withColumnPadding(1)
                .withMaxBufferedLines(1000000)
                .withFlushOnTab();

        if (cropOutput) {
            builder.withAutoCrop();
        }

        return builder.build();
    }

    // No-op converters: the dump only needs raw values, so record assembly is skipped.
    private static final class DumpGroupConverter extends GroupConverter {
        @Override public void start() { }
        @Override public void end() { }
        @Override public Converter getConverter(int fieldIndex) { return new DumpConverter(); }
    }

    private static final class DumpConverter extends PrimitiveConverter {
        @Override public GroupConverter asGroupConverter() { return new DumpGroupConverter(); }
    }
}
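For context, here is a minimal sketch of calling the public static DumpCommand.dump(...) entry point directly, mirroring what execute() does after parsing the CLI flags. The input file name data.parquet and the class name DumpExample are placeholders, and the writer is built with the same PrettyPrintWriter builder calls used in prettyPrintWriter() above.

import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.tools.command.DumpCommand;
import org.apache.parquet.tools.util.PrettyPrintWriter;
import org.apache.parquet.tools.util.PrettyPrintWriter.WhiteSpaceHandler;

public class DumpExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path inpath = new Path("data.parquet"); // placeholder input file

        // Read only the footer to obtain the schema and row-group metadata.
        ParquetMetadata meta = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
        MessageType schema = meta.getFileMetaData().getSchema();

        // Stdout writer configured like DumpCommand.prettyPrintWriter().
        PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                .withAutoColumn()
                .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
                .withColumnPadding(1)
                .build();

        // showmd = true, showdt = true, no column filter: dump metadata and all column data.
        DumpCommand.dump(out, meta, schema, inpath, true, true, null);
    }
}

When run through the parquet-tools CLI, this class typically backs the dump subcommand, invoked with something like `dump -c some_column /path/to/file.parquet`; the exact launcher (standalone jar or `hadoop jar`) depends on how parquet-tools is packaged in your environment.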