org.apache.hadoop.hive.ql.io.orc.FileDump.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.io.orc.FileDump.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndex;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONWriter;

/**
 * A tool for printing out the file structure of ORC files.
 */
public final class FileDump {
    public static final String UNKNOWN = "UNKNOWN";

    // not used
    private FileDump() {
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        List<Integer> rowIndexCols = null;
        Options opts = createOptions();
        CommandLine cli = new GnuParser().parse(opts, args);

        if (cli.hasOption('h')) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("orcfiledump", opts);
            return;
        }

        boolean dumpData = cli.hasOption('d');
        if (cli.hasOption("r")) {
            String[] colStrs = cli.getOptionValue("r").split(",");
            rowIndexCols = new ArrayList<Integer>(colStrs.length);
            for (String colStr : colStrs) {
                rowIndexCols.add(Integer.parseInt(colStr));
            }
        }

        boolean printTimeZone = cli.hasOption('t');
        boolean jsonFormat = cli.hasOption('j');
        String[] files = cli.getArgs();
        if (dumpData) {
            printData(Arrays.asList(files), conf);
        } else {
            if (jsonFormat) {
                boolean prettyPrint = cli.hasOption('p');
                JsonFileDump.printJsonMetaData(Arrays.asList(files), conf, rowIndexCols, prettyPrint,
                        printTimeZone);
            } else {
                printMetaData(Arrays.asList(files), conf, rowIndexCols, printTimeZone);
            }
        }
    }

    private static void printData(List<String> files, Configuration conf) throws IOException, JSONException {
        for (String file : files) {
            printJsonData(conf, file);
        }
    }

    private static void printMetaData(List<String> files, Configuration conf, List<Integer> rowIndexCols,
            boolean printTimeZone) throws IOException {
        for (String filename : files) {
            System.out.println("Structure for " + filename);
            Path path = new Path(filename);
            Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
            System.out.println(
                    "File Version: " + reader.getFileVersion().getName() + " with " + reader.getWriterVersion());
            RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
            System.out.println("Rows: " + reader.getNumberOfRows());
            System.out.println("Compression: " + reader.getCompression());
            if (reader.getCompression() != CompressionKind.NONE) {
                System.out.println("Compression size: " + reader.getCompressionSize());
            }
            System.out.println("Type: " + reader.getObjectInspector().getTypeName());
            System.out.println("\nStripe Statistics:");
            Metadata metadata = reader.getMetadata();
            for (int n = 0; n < metadata.getStripeStatistics().size(); n++) {
                System.out.println("  Stripe " + (n + 1) + ":");
                StripeStatistics ss = metadata.getStripeStatistics().get(n);
                for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
                    System.out.println("    Column " + i + ": " + ss.getColumnStatistics()[i].toString());
                }
            }
            ColumnStatistics[] stats = reader.getStatistics();
            int colCount = stats.length;
            System.out.println("\nFile Statistics:");
            for (int i = 0; i < stats.length; ++i) {
                System.out.println("  Column " + i + ": " + stats[i].toString());
            }
            System.out.println("\nStripes:");
            int stripeIx = -1;
            for (StripeInformation stripe : reader.getStripes()) {
                ++stripeIx;
                long stripeStart = stripe.getOffset();
                OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
                if (printTimeZone) {
                    String tz = footer.getWriterTimezone();
                    if (tz == null || tz.isEmpty()) {
                        tz = UNKNOWN;
                    }
                    System.out.println("  Stripe: " + stripe.toString() + " timezone: " + tz);
                } else {
                    System.out.println("  Stripe: " + stripe.toString());
                }
                long sectionStart = stripeStart;
                for (OrcProto.Stream section : footer.getStreamsList()) {
                    String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
                    System.out.println("    Stream: column " + section.getColumn() + " section " + kind + " start: "
                            + sectionStart + " length " + section.getLength());
                    sectionStart += section.getLength();
                }
                for (int i = 0; i < footer.getColumnsCount(); ++i) {
                    OrcProto.ColumnEncoding encoding = footer.getColumns(i);
                    StringBuilder buf = new StringBuilder();
                    buf.append("    Encoding column ");
                    buf.append(i);
                    buf.append(": ");
                    buf.append(encoding.getKind());
                    if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY
                            || encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
                        buf.append("[");
                        buf.append(encoding.getDictionarySize());
                        buf.append("]");
                    }
                    System.out.println(buf);
                }
                if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
                    // include the columns that are specified, only if the columns are included, bloom filter
                    // will be read
                    boolean[] sargColumns = new boolean[colCount];
                    for (int colIdx : rowIndexCols) {
                        sargColumns[colIdx] = true;
                    }
                    RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, sargColumns);
                    for (int col : rowIndexCols) {
                        StringBuilder buf = new StringBuilder();
                        String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
                        buf.append(rowIdxString);
                        String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
                        buf.append(bloomFilString);
                        System.out.println(buf);
                    }
                }
            }

            FileSystem fs = path.getFileSystem(conf);
            long fileLen = fs.getContentSummary(path).getLength();
            long paddedBytes = getTotalPaddingSize(reader);
            // empty ORC file is ~45 bytes. Assumption here is file length always >0
            double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
            DecimalFormat format = new DecimalFormat("##.##");
            System.out.println("\nFile length: " + fileLen + " bytes");
            System.out.println("Padding length: " + paddedBytes + " bytes");
            System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
            rows.close();
        }
    }

    private static String getFormattedBloomFilters(int col, OrcProto.BloomFilterIndex[] bloomFilterIndex) {
        StringBuilder buf = new StringBuilder();
        BloomFilterIO stripeLevelBF = null;
        if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
            int idx = 0;
            buf.append("\n    Bloom filters for column ").append(col).append(":");
            for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
                BloomFilterIO toMerge = new BloomFilterIO(bf);
                buf.append("\n      Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
                if (stripeLevelBF == null) {
                    stripeLevelBF = toMerge;
                } else {
                    stripeLevelBF.merge(toMerge);
                }
            }
            String bloomFilterStats = getBloomFilterStats(stripeLevelBF);
            buf.append("\n      Stripe level merge:").append(bloomFilterStats);
        }
        return buf.toString();
    }

    private static String getBloomFilterStats(BloomFilterIO bf) {
        StringBuilder sb = new StringBuilder();
        int bitCount = bf.getBitSize();
        int popCount = 0;
        for (long l : bf.getBitSet()) {
            popCount += Long.bitCount(l);
        }
        int k = bf.getNumHashFunctions();
        float loadFactor = (float) popCount / (float) bitCount;
        float expectedFpp = (float) Math.pow(loadFactor, k);
        DecimalFormat df = new DecimalFormat("###.####");
        sb.append(" numHashFunctions: ").append(k);
        sb.append(" bitCount: ").append(bitCount);
        sb.append(" popCount: ").append(popCount);
        sb.append(" loadFactor: ").append(df.format(loadFactor));
        sb.append(" expectedFpp: ").append(expectedFpp);
        return sb.toString();
    }

    private static String getFormattedRowIndices(int col, RowIndex[] rowGroupIndex) {
        StringBuilder buf = new StringBuilder();
        RowIndex index;
        buf.append("    Row group indices for column ").append(col).append(":");
        if (rowGroupIndex == null || (col >= rowGroupIndex.length) || ((index = rowGroupIndex[col]) == null)) {
            buf.append(" not found\n");
            return buf.toString();
        }

        for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
            buf.append("\n      Entry ").append(entryIx).append(": ");
            RowIndexEntry entry = index.getEntry(entryIx);
            if (entry == null) {
                buf.append("unknown\n");
                continue;
            }
            OrcProto.ColumnStatistics colStats = entry.getStatistics();
            if (colStats == null) {
                buf.append("no stats at ");
            } else {
                ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
                buf.append(cs.toString());
            }
            buf.append(" positions: ");
            for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
                if (posIx != 0) {
                    buf.append(",");
                }
                buf.append(entry.getPositions(posIx));
            }
        }
        return buf.toString();
    }

    public static long getTotalPaddingSize(Reader reader) throws IOException {
        long paddedBytes = 0;
        List<org.apache.hadoop.hive.ql.io.orc.StripeInformation> stripes = reader.getStripes();
        for (int i = 1; i < stripes.size(); i++) {
            long prevStripeOffset = stripes.get(i - 1).getOffset();
            long prevStripeLen = stripes.get(i - 1).getLength();
            paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
        }
        return paddedBytes;
    }

    static Options createOptions() {
        Options result = new Options();

        // add -d and --data to print the rows
        result.addOption(
                OptionBuilder.withLongOpt("data").withDescription("Should the data be printed").create('d'));

        // to avoid breaking unit tests (when run in different time zones) for file dump, printing
        // of timezone is made optional
        result.addOption(
                OptionBuilder.withLongOpt("timezone").withDescription("Print writer's time zone").create('t'));

        result.addOption(OptionBuilder.withLongOpt("help").withDescription("print help message").create('h'));

        result.addOption(OptionBuilder.withLongOpt("rowindex")
                .withArgName("comma separated list of column ids for which row index should be printed")
                .withDescription("Dump stats for column number(s)").hasArg().create('r'));

        result.addOption(
                OptionBuilder.withLongOpt("json").withDescription("Print metadata in JSON format").create('j'));

        result.addOption(OptionBuilder.withLongOpt("pretty").withDescription("Pretty print json metadata output")
                .create('p'));

        return result;
    }

    private static void printMap(JSONWriter writer, Map<Object, Object> obj, List<OrcProto.Type> types,
            OrcProto.Type type) throws IOException, JSONException {
        writer.array();
        int keyType = type.getSubtypes(0);
        int valueType = type.getSubtypes(1);
        for (Map.Entry<Object, Object> item : obj.entrySet()) {
            writer.object();
            writer.key("_key");
            printObject(writer, item.getKey(), types, keyType);
            writer.key("_value");
            printObject(writer, item.getValue(), types, valueType);
            writer.endObject();
        }
        writer.endArray();
    }

    private static void printList(JSONWriter writer, List<Object> obj, List<OrcProto.Type> types,
            OrcProto.Type type) throws IOException, JSONException {
        int subtype = type.getSubtypes(0);
        writer.array();
        for (Object item : obj) {
            printObject(writer, item, types, subtype);
        }
        writer.endArray();
    }

    private static void printUnion(JSONWriter writer, OrcUnion obj, List<OrcProto.Type> types, OrcProto.Type type)
            throws IOException, JSONException {
        int subtype = type.getSubtypes(obj.getTag());
        printObject(writer, obj.getObject(), types, subtype);
    }

    static void printStruct(JSONWriter writer, OrcStruct obj, List<OrcProto.Type> types, OrcProto.Type type)
            throws IOException, JSONException {
        writer.object();
        List<Integer> fieldTypes = type.getSubtypesList();
        for (int i = 0; i < fieldTypes.size(); ++i) {
            writer.key(type.getFieldNames(i));
            printObject(writer, obj.getFieldValue(i), types, fieldTypes.get(i));
        }
        writer.endObject();
    }

    static void printObject(JSONWriter writer, Object obj, List<OrcProto.Type> types, int typeId)
            throws IOException, JSONException {
        OrcProto.Type type = types.get(typeId);
        if (obj == null) {
            writer.value(null);
        } else {
            switch (type.getKind()) {
            case STRUCT:
                printStruct(writer, (OrcStruct) obj, types, type);
                break;
            case UNION:
                printUnion(writer, (OrcUnion) obj, types, type);
                break;
            case LIST:
                printList(writer, (List<Object>) obj, types, type);
                break;
            case MAP:
                printMap(writer, (Map<Object, Object>) obj, types, type);
                break;
            case BYTE:
                writer.value(((ByteWritable) obj).get());
                break;
            case SHORT:
                writer.value(((ShortWritable) obj).get());
                break;
            case INT:
                writer.value(((IntWritable) obj).get());
                break;
            case LONG:
                writer.value(((LongWritable) obj).get());
                break;
            case FLOAT:
                writer.value(((FloatWritable) obj).get());
                break;
            case DOUBLE:
                writer.value(((DoubleWritable) obj).get());
                break;
            case BOOLEAN:
                writer.value(((BooleanWritable) obj).get());
                break;
            default:
                writer.value(obj.toString());
                break;
            }
        }
    }

    static void printJsonData(Configuration conf, String filename) throws IOException, JSONException {
        Path path = new Path(filename);
        Reader reader = OrcFile.createReader(path.getFileSystem(conf), path);
        PrintStream printStream = System.out;
        OutputStreamWriter out = new OutputStreamWriter(printStream, "UTF-8");
        RecordReader rows = reader.rows(null);
        Object row = null;
        List<OrcProto.Type> types = reader.getTypes();
        while (rows.hasNext()) {
            row = rows.next(row);
            JSONWriter writer = new JSONWriter(out);
            printObject(writer, row, types, 0);
            out.write("\n");
            out.flush();
            if (printStream.checkError()) {
                throw new IOException("Error encountered when writing to stdout.");
            }
        }
    }
}