org.apache.orc.tools.FileDump.java Source code

Introduction

Here is the source code for org.apache.orc.tools.FileDump.java, a command-line tool for printing out the file structure of ORC files.
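
As a quick, hypothetical illustration (not part of the original file), the tool can also be driven programmatically; the flags below mirror the options registered in createOptions() in the source:

// a minimal sketch, assuming an ORC file exists at the (hypothetical) path /tmp/example.orc
public class FileDumpExample {
    public static void main(String[] args) throws Exception {
        org.apache.orc.tools.FileDump.main(new String[] {
            "-j", "-p",          // print metadata as pretty-printed JSON
            "-t",                // include the writer's time zone
            "/tmp/example.orc"   // file (or directory) to dump
        });
    }
}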

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.orc.tools;

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.orc.BloomFilterIO;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.impl.AcidStats;
import org.apache.orc.impl.ColumnStatisticsImpl;
import org.apache.orc.impl.OrcAcidUtils;
import org.apache.orc.impl.OrcIndex;
import org.apache.orc.OrcProto;
import org.apache.orc.StripeInformation;
import org.apache.orc.StripeStatistics;
import org.apache.orc.impl.RecordReaderImpl;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONWriter;

import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;

/**
 * A tool for printing out the file structure of ORC files.
 */
public final class FileDump {
    public static final String UNKNOWN = "UNKNOWN";
    public static final String SEPARATOR = Strings.repeat("_", 120) + "\n";
    public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024;
    public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir");
    public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new PathFilter() {
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".")
                    && !name.endsWith(OrcAcidUtils.DELTA_SIDE_FILE_SUFFIX);
        }
    };

    // not used; FileDump is a static utility class
    private FileDump() {
    }

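    /**
     * Command-line entry point. The options recognized here are the ones
     * registered in createOptions() below: -d/--data, -t/--timezone,
     * -h/--help, -r/--rowindex, -j/--json, -p/--pretty, --recover,
     * --skip-dump and --backup-path. Any remaining arguments are treated
     * as file or directory paths.
     */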
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        List<Integer> rowIndexCols = new ArrayList<Integer>(0);
        Options opts = createOptions();
        CommandLine cli = new GnuParser().parse(opts, args);

        if (cli.hasOption('h')) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("orcfiledump", opts);
            return;
        }

        boolean dumpData = cli.hasOption('d');
        boolean recover = cli.hasOption("recover");
        boolean skipDump = cli.hasOption("skip-dump");
        String backupPath = DEFAULT_BACKUP_PATH;
        if (cli.hasOption("backup-path")) {
            backupPath = cli.getOptionValue("backup-path");
        }

        if (cli.hasOption("r")) {
            String val = cli.getOptionValue("r");
            if (val != null && val.trim().equals("*")) {
                rowIndexCols = null; // All the columns
            } else {
                String[] colStrs = cli.getOptionValue("r").split(",");
                rowIndexCols = new ArrayList<Integer>(colStrs.length);
                for (String colStr : colStrs) {
                    rowIndexCols.add(Integer.parseInt(colStr));
                }
            }
        }

        boolean printTimeZone = cli.hasOption('t');
        boolean jsonFormat = cli.hasOption('j');
        String[] files = cli.getArgs();
        if (files.length == 0) {
            System.err.println("Error : ORC files are not specified");
            return;
        }

        // if a specified path is a directory, iterate through all its files and print the file dump for each
        List<String> filesInPath = Lists.newArrayList();
        for (String filename : files) {
            Path path = new Path(filename);
            filesInPath.addAll(getAllFilesInPath(path, conf));
        }

        if (dumpData) {
            printData(filesInPath, conf);
        } else if (recover && skipDump) {
            recoverFiles(filesInPath, conf, backupPath);
        } else {
            if (jsonFormat) {
                boolean prettyPrint = cli.hasOption('p');
                JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
            } else {
                printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
            }
        }
    }

    /**
     * This method returns an ORC reader object if the specified file is readable. If the specified
     * file has a side file (_flush_length), then the max footer offset will be read from the side
     * file and the ORC reader will be created from that offset. Since both the data file and the
     * side file use hflush() to flush data, the two files can be inconsistent or out of sync.
     * Null will be returned in the following cases:
     *
     * 1) The file specified by path, or its side file, is still open for writes
     * 2) The *_flush_length file does not contain any footer offset
     * 3) The *_flush_length file contains a valid footer offset but the data file is not readable
     *    at that position (incomplete data file)
     * 4) The *_flush_length file length is not a multiple of 8; the reader will then be created
     *    from the previous valid footer, and if there is no such footer (file length > 0 and < 8),
     *    null will be returned
     *
     * Also, if this method detects any file corruption (mismatch between the data file and the side
     * file) it will add the corresponding file to the specified list of corrupted files.
     *
     * In all other cases, where the file is readable, this method returns a reader object.
     *
     * @param path - file to get reader for
     * @param conf - configuration object
     * @param corruptFiles - fills this list with all possible corrupted files
     * @return - reader for the specified file or null
     * @throws IOException
     */
    static Reader getReader(final Path path, final Configuration conf, final List<String> corruptFiles)
            throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        long dataFileLen = fs.getFileStatus(path).getLen();
        System.err.println("Processing data file " + path + " [length: " + dataFileLen + "]");
        Path sideFile = OrcAcidUtils.getSideFile(path);
        final boolean sideFileExists = fs.exists(sideFile);
        boolean openDataFile = false;
        boolean openSideFile = false;
        if (fs instanceof DistributedFileSystem) {
            DistributedFileSystem dfs = (DistributedFileSystem) fs;
            openDataFile = !dfs.isFileClosed(path);
            openSideFile = sideFileExists && !dfs.isFileClosed(sideFile);
        }

        if (openDataFile || openSideFile) {
            if (openDataFile && openSideFile) {
                System.err.println("Unable to perform file dump as " + path + " and " + sideFile
                        + " are still open for writes.");
            } else if (openSideFile) {
                System.err.println("Unable to perform file dump as " + sideFile + " is still open for writes.");
            } else {
                System.err.println("Unable to perform file dump as " + path + " is still open for writes.");
            }

            return null;
        }

        Reader reader = null;
        if (sideFileExists) {
            final long maxLen = OrcAcidUtils.getLastFlushLength(fs, path);
            final long sideFileLen = fs.getFileStatus(sideFile).getLen();
            System.err.println("Found flush length file " + sideFile + " [length: " + sideFileLen
                    + ", maxFooterOffset: " + maxLen + "]");
            // no footer offset could be read from the side file
            if (maxLen == -1) {
                // if data file is larger than last flush length, then additional data could be recovered
                if (dataFileLen > maxLen) {
                    System.err.println("Data file has more data than max footer offset:" + maxLen
                            + ". Adding data file to recovery list.");
                    if (corruptFiles != null) {
                        corruptFiles.add(path.toUri().toString());
                    }
                }
                return null;
            }

            try {
                reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen));

                // if data file is larger than last flush length, then additional data could be recovered
                if (dataFileLen > maxLen) {
                    System.err.println("Data file has more data than max footer offset:" + maxLen
                            + ". Adding data file to recovery list.");
                    if (corruptFiles != null) {
                        corruptFiles.add(path.toUri().toString());
                    }
                }
            } catch (Exception e) {
                if (corruptFiles != null) {
                    corruptFiles.add(path.toUri().toString());
                }
                System.err.println(
                        "Unable to read data from max footer offset." + " Adding data file to recovery list.");
                return null;
            }
        } else {
            reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        }

        return reader;
    }

    public static Collection<String> getAllFilesInPath(final Path path, final Configuration conf)
            throws IOException {
        List<String> filesInPath = Lists.newArrayList();
        FileSystem fs = path.getFileSystem(conf);
        FileStatus fileStatus = fs.getFileStatus(path);
        if (fileStatus.isDir()) {
            FileStatus[] fileStatuses = fs.listStatus(path, HIDDEN_AND_SIDE_FILE_FILTER);
            for (FileStatus fileInPath : fileStatuses) {
                if (fileInPath.isDir()) {
                    filesInPath.addAll(getAllFilesInPath(fileInPath.getPath(), conf));
                } else {
                    filesInPath.add(fileInPath.getPath().toString());
                }
            }
        } else {
            filesInPath.add(path.toString());
        }

        return filesInPath;
    }

    private static void printData(List<String> files, Configuration conf) throws IOException, JSONException {
        for (String file : files) {
            try {
                Path path = new Path(file);
                Reader reader = getReader(path, conf, Lists.<String>newArrayList());
                if (reader == null) {
                    continue;
                }
                printJsonData(reader);
                System.out.println(SEPARATOR);
            } catch (Exception e) {
                System.err.println("Unable to dump data for file: " + file);
                continue;
            }
        }
    }

    private static void printMetaData(List<String> files, Configuration conf, List<Integer> rowIndexCols,
            boolean printTimeZone, final boolean recover, final String backupPath) throws IOException {
        List<String> corruptFiles = Lists.newArrayList();
        for (String filename : files) {
            printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
            System.out.println(SEPARATOR);
        }

        if (!corruptFiles.isEmpty()) {
            if (recover) {
                recoverFiles(corruptFiles, conf, backupPath);
            } else {
                System.err.println(corruptFiles.size() + " file(s) are corrupted."
                        + " Run the following command to recover corrupted files.\n");
                String fileNames = Joiner.on(" ").skipNulls().join(corruptFiles);
                System.err.println("hive --orcfiledump --recover --skip-dump " + fileNames);
                System.out.println(SEPARATOR);
            }
        }
    }

    private static void printMetaDataImpl(final String filename, final Configuration conf,
            List<Integer> rowIndexCols, final boolean printTimeZone, final List<String> corruptFiles)
            throws IOException {
        Path file = new Path(filename);
        Reader reader = getReader(file, conf, corruptFiles);
        // if we can create a reader then the footer is not corrupt and the file is readable
        if (reader == null) {
            return;
        }

        System.out.println("Structure for " + filename);
        System.out.println(
                "File Version: " + reader.getFileVersion().getName() + " with " + reader.getWriterVersion());
        RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
        System.out.println("Rows: " + reader.getNumberOfRows());
        System.out.println("Compression: " + reader.getCompressionKind());
        if (reader.getCompressionKind() != CompressionKind.NONE) {
            System.out.println("Compression size: " + reader.getCompressionSize());
        }
        System.out.println("Type: " + reader.getSchema().toString());
        System.out.println("\nStripe Statistics:");
        List<StripeStatistics> stripeStats = reader.getStripeStatistics();
        for (int n = 0; n < stripeStats.size(); n++) {
            System.out.println("  Stripe " + (n + 1) + ":");
            StripeStatistics ss = stripeStats.get(n);
            for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
                System.out.println("    Column " + i + ": " + ss.getColumnStatistics()[i].toString());
            }
        }
        ColumnStatistics[] stats = reader.getStatistics();
        int colCount = stats.length;
        if (rowIndexCols == null) {
            rowIndexCols = new ArrayList<>(colCount);
            for (int i = 0; i < colCount; ++i) {
                rowIndexCols.add(i);
            }
        }
        System.out.println("\nFile Statistics:");
        for (int i = 0; i < stats.length; ++i) {
            System.out.println("  Column " + i + ": " + stats[i].toString());
        }
        System.out.println("\nStripes:");
        int stripeIx = -1;
        for (StripeInformation stripe : reader.getStripes()) {
            ++stripeIx;
            long stripeStart = stripe.getOffset();
            OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
            if (printTimeZone) {
                String tz = footer.getWriterTimezone();
                if (tz == null || tz.isEmpty()) {
                    tz = UNKNOWN;
                }
                System.out.println("  Stripe: " + stripe.toString() + " timezone: " + tz);
            } else {
                System.out.println("  Stripe: " + stripe.toString());
            }
            long sectionStart = stripeStart;
            for (OrcProto.Stream section : footer.getStreamsList()) {
                String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
                System.out.println("    Stream: column " + section.getColumn() + " section " + kind + " start: "
                        + sectionStart + " length " + section.getLength());
                sectionStart += section.getLength();
            }
            for (int i = 0; i < footer.getColumnsCount(); ++i) {
                OrcProto.ColumnEncoding encoding = footer.getColumns(i);
                StringBuilder buf = new StringBuilder();
                buf.append("    Encoding column ");
                buf.append(i);
                buf.append(": ");
                buf.append(encoding.getKind());
                if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY
                        || encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
                    buf.append("[");
                    buf.append(encoding.getDictionarySize());
                    buf.append("]");
                }
                System.out.println(buf);
            }
            if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
                // bloom filters are read only for the columns that are explicitly
                // included via sargColumns
                boolean[] sargColumns = new boolean[colCount];
                for (int colIdx : rowIndexCols) {
                    sargColumns[colIdx] = true;
                }
                OrcIndex indices = rows.readRowIndex(stripeIx, null, null, null, sargColumns);
                for (int col : rowIndexCols) {
                    StringBuilder buf = new StringBuilder();
                    String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
                    buf.append(rowIdxString);
                    String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
                    buf.append(bloomFilString);
                    System.out.println(buf);
                }
            }
        }

        FileSystem fs = file.getFileSystem(conf);
        long fileLen = fs.getFileStatus(file).getLen();
        long paddedBytes = getTotalPaddingSize(reader);
        // an empty ORC file is ~45 bytes, so the file length is assumed to always be > 0
        double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
        DecimalFormat format = new DecimalFormat("##.##");
        System.out.println("\nFile length: " + fileLen + " bytes");
        System.out.println("Padding length: " + paddedBytes + " bytes");
        System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
        AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
        if (acidStats != null) {
            System.out.println("ACID stats:" + acidStats);
        }
        rows.close();
    }

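    /**
     * Recovery scans each corrupt file block by block for the ORC MAGIC
     * bytes, probes whether the file is readable when truncated just past
     * each occurrence (see isReadable()), and hands the resulting list of
     * valid footer offsets to recoverFile() below.
     */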
    private static void recoverFiles(final List<String> corruptFiles, final Configuration conf, final String backup)
            throws IOException {
        for (String corruptFile : corruptFiles) {
            System.err.println("Recovering file " + corruptFile);
            Path corruptPath = new Path(corruptFile);
            FileSystem fs = corruptPath.getFileSystem(conf);
            FSDataInputStream fdis = fs.open(corruptPath);
            try {
                long corruptFileLen = fs.getFileStatus(corruptPath).getLen();
                long remaining = corruptFileLen;
                List<Long> footerOffsets = Lists.newArrayList();

                // start reading the data file from top to bottom and record the valid footers
                while (remaining > 0) {
                    int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
                    byte[] data = new byte[toRead];
                    long startPos = corruptFileLen - remaining;
                    fdis.readFully(startPos, data, 0, toRead);

                    // find every occurrence of the MAGIC string and check whether the file is readable from there
                    int index = 0;
                    long nextFooterOffset;

                    while (index != -1) {
                        index = indexOf(data, OrcFile.MAGIC.getBytes(), index + 1);
                        if (index != -1) {
                            nextFooterOffset = startPos + index + OrcFile.MAGIC.length() + 1;
                            if (isReadable(corruptPath, conf, nextFooterOffset)) {
                                footerOffsets.add(nextFooterOffset);
                            }
                        }
                    }

                    System.err.println("Scanning for valid footers - startPos: " + startPos + " toRead: " + toRead
                            + " remaining: " + remaining);
                    remaining = remaining - toRead;
                }

                System.err.println("Readable footerOffsets: " + footerOffsets);
                recoverFile(corruptPath, fs, conf, footerOffsets, backup);
            } catch (Exception e) {
                Path recoveryFile = getRecoveryFile(corruptPath);
                if (fs.exists(recoveryFile)) {
                    fs.delete(recoveryFile, false);
                }
                System.err.println("Unable to recover file " + corruptFile);
                e.printStackTrace();
                System.err.println(SEPARATOR);
                continue;
            } finally {
                fdis.close();
            }
            System.err.println(corruptFile + " recovered successfully!");
            System.err.println(SEPARATOR);
        }
    }

    private static void recoverFile(final Path corruptPath, final FileSystem fs, final Configuration conf,
            final List<Long> footerOffsets, final String backup) throws IOException {

        // first recover the file to a .recovered file and, once successful, rename it to the actual file
        Path recoveredPath = getRecoveryFile(corruptPath);

        // make sure the recovery file does not already exist
        if (fs.exists(recoveredPath)) {
            fs.delete(recoveredPath, false);
        }

        // if there are no valid footers, still produce a readable file by writing out an empty ORC file
        if (footerOffsets == null || footerOffsets.isEmpty()) {
            System.err.println("No readable footers found. Creating empty orc file.");
            TypeDescription schema = TypeDescription.createStruct();
            Writer writer = OrcFile.createWriter(recoveredPath, OrcFile.writerOptions(conf).setSchema(schema));
            writer.close();
        } else {
            FSDataInputStream fdis = fs.open(corruptPath);
            FileStatus fileStatus = fs.getFileStatus(corruptPath);
            // read corrupt file and copy it to recovered file until last valid footer
            FSDataOutputStream fdos = fs.create(recoveredPath, true, conf.getInt("io.file.buffer.size", 4096),
                    fileStatus.getReplication(), fileStatus.getBlockSize());
            try {
                long fileLen = footerOffsets.get(footerOffsets.size() - 1);
                long remaining = fileLen;

                while (remaining > 0) {
                    int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
                    byte[] data = new byte[toRead];
                    long startPos = fileLen - remaining;
                    fdis.readFully(startPos, data, 0, toRead);
                    fdos.write(data);
                    System.err.println("Copying data to recovery file - startPos: " + startPos + " toRead: "
                            + toRead + " remaining: " + remaining);
                    remaining = remaining - toRead;
                }
            } catch (Exception e) {
                fs.delete(recoveredPath, false);
                throw new IOException(e);
            } finally {
                fdis.close();
                fdos.close();
            }
        }

        // validate the recovered file once again and start moving corrupt files to backup folder
        if (isReadable(recoveredPath, conf, Long.MAX_VALUE)) {
            Path backupDataPath;
            String scheme = corruptPath.toUri().getScheme();
            String authority = corruptPath.toUri().getAuthority();
            String filePath = corruptPath.toUri().getPath();

            // use the same filesystem as the corrupt file if backup-path is not explicitly specified
            if (backup.equals(DEFAULT_BACKUP_PATH)) {
                backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath);
            } else {
                backupDataPath = Path.mergePaths(new Path(backup), corruptPath);
            }

            // Move data file to backup path
            moveFiles(fs, corruptPath, backupDataPath);

            // Move side file to backup path
            Path sideFilePath = OrcAcidUtils.getSideFile(corruptPath);
            Path backupSideFilePath = new Path(backupDataPath.getParent(), sideFilePath.getName());
            moveFiles(fs, sideFilePath, backupSideFilePath);

            // finally move recovered file to actual file
            moveFiles(fs, recoveredPath, corruptPath);

            // we are done recovering, backing up and validating
            System.err.println("Validation of recovered file successful!");
        }
    }

    private static void moveFiles(final FileSystem fs, final Path src, final Path dest) throws IOException {
        try {
            // create the destination directory if it does not exist
            if (!fs.exists(dest.getParent())) {
                fs.mkdirs(dest.getParent());
            }

            // if the destination file already exists, delete it
            fs.delete(dest, false);

            if (fs.rename(src, dest)) {
                System.err.println("Moved " + src + " to " + dest);
            } else {
                throw new IOException("Unable to move " + src + " to " + dest);
            }

        } catch (Exception e) {
            throw new IOException("Unable to move " + src + " to " + dest, e);
        }
    }

    private static Path getRecoveryFile(final Path corruptPath) {
        return new Path(corruptPath.getParent(), corruptPath.getName() + ".recovered");
    }

    private static boolean isReadable(final Path corruptPath, final Configuration conf, final long maxLen) {
        try {
            OrcFile.createReader(corruptPath, OrcFile.readerOptions(conf).maxLength(maxLen));
            return true;
        } catch (Exception e) {
            // ignore the exception; the file is not readable at maxLen
            return false;
        }
    }

    // search for byte pattern in another byte array
    private static int indexOf(final byte[] data, final byte[] pattern, final int index) {
        if (data == null || data.length == 0 || pattern == null || pattern.length == 0 || index > data.length
                || index < 0) {
            return -1;
        }

        int j = 0;
        for (int i = index; i < data.length; i++) {
            if (pattern[j] == data[i]) {
                j++;
            } else {
                // mismatch after a partial match: back up so the bytes the
                // partial match consumed are re-examined (otherwise overlapping
                // candidates, e.g. "OORC" scanned for "ORC", would be missed)
                i -= j;
                j = 0;
            }

            if (j == pattern.length) {
                return i - pattern.length + 1;
            }
        }

        return -1;
    }

    private static String getFormattedBloomFilters(int col, OrcProto.BloomFilterIndex[] bloomFilterIndex) {
        StringBuilder buf = new StringBuilder();
        BloomFilterIO stripeLevelBF = null;
        if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
            int idx = 0;
            buf.append("\n    Bloom filters for column ").append(col).append(":");
            for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
                BloomFilterIO toMerge = new BloomFilterIO(bf);
                buf.append("\n      Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
                if (stripeLevelBF == null) {
                    stripeLevelBF = toMerge;
                } else {
                    stripeLevelBF.merge(toMerge);
                }
            }
            String bloomFilterStats = getBloomFilterStats(stripeLevelBF);
            buf.append("\n      Stripe level merge:").append(bloomFilterStats);
        }
        return buf.toString();
    }

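    /**
     * loadFactor below is the fraction of bits set in the filter; with k
     * independent hash functions the expected false-positive probability
     * is approximately loadFactor^k, which is what expectedFpp reports.
     */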
    private static String getBloomFilterStats(BloomFilterIO bf) {
        StringBuilder sb = new StringBuilder();
        int bitCount = bf.getBitSize();
        int popCount = 0;
        for (long l : bf.getBitSet()) {
            popCount += Long.bitCount(l);
        }
        int k = bf.getNumHashFunctions();
        float loadFactor = (float) popCount / (float) bitCount;
        float expectedFpp = (float) Math.pow(loadFactor, k);
        DecimalFormat df = new DecimalFormat("###.####");
        sb.append(" numHashFunctions: ").append(k);
        sb.append(" bitCount: ").append(bitCount);
        sb.append(" popCount: ").append(popCount);
        sb.append(" loadFactor: ").append(df.format(loadFactor));
        sb.append(" expectedFpp: ").append(expectedFpp);
        return sb.toString();
    }

    private static String getFormattedRowIndices(int col, OrcProto.RowIndex[] rowGroupIndex) {
        StringBuilder buf = new StringBuilder();
        OrcProto.RowIndex index;
        buf.append("    Row group indices for column ").append(col).append(":");
        if (rowGroupIndex == null || (col >= rowGroupIndex.length) || ((index = rowGroupIndex[col]) == null)) {
            buf.append(" not found\n");
            return buf.toString();
        }

        for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
            buf.append("\n      Entry ").append(entryIx).append(": ");
            OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
            if (entry == null) {
                buf.append("unknown\n");
                continue;
            }
            OrcProto.ColumnStatistics colStats = entry.getStatistics();
            if (colStats == null) {
                buf.append("no stats at ");
            } else {
                ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
                buf.append(cs.toString());
            }
            buf.append(" positions: ");
            for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
                if (posIx != 0) {
                    buf.append(",");
                }
                buf.append(entry.getPositions(posIx));
            }
        }
        return buf.toString();
    }

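    /**
     * Padding is the gap between the end of one stripe (offset + length)
     * and the start of the next; summing these gaps over all adjacent
     * stripe pairs yields the total padding reported in the dump.
     */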
    public static long getTotalPaddingSize(Reader reader) throws IOException {
        long paddedBytes = 0;
        List<StripeInformation> stripes = reader.getStripes();
        for (int i = 1; i < stripes.size(); i++) {
            long prevStripeOffset = stripes.get(i - 1).getOffset();
            long prevStripeLen = stripes.get(i - 1).getLength();
            paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
        }
        return paddedBytes;
    }

    @SuppressWarnings("static-access")
    static Options createOptions() {
        Options result = new Options();

        // add -d and --data to print the rows
        result.addOption(
                OptionBuilder.withLongOpt("data").withDescription("Should the data be printed").create('d'));

        // printing of the writer time zone is optional so that file dump unit tests
        // do not break when run in different time zones
        result.addOption(
                OptionBuilder.withLongOpt("timezone").withDescription("Print writer's time zone").create('t'));

        result.addOption(OptionBuilder.withLongOpt("help").withDescription("print help message").create('h'));

        result.addOption(OptionBuilder.withLongOpt("rowindex")
                .withArgName("comma separated list of column ids for which row index should be printed")
                .withDescription("Dump stats for column number(s)").hasArg().create('r'));

        result.addOption(
                OptionBuilder.withLongOpt("json").withDescription("Print metadata in JSON format").create('j'));

        result.addOption(OptionBuilder.withLongOpt("pretty").withDescription("Pretty print json metadata output")
                .create('p'));

        result.addOption(OptionBuilder.withLongOpt("recover")
                .withDescription("recover corrupted orc files generated by streaming").create());

        result.addOption(OptionBuilder.withLongOpt("skip-dump")
                .withDescription("used along with --recover to directly recover files without dumping").create());

        result.addOption(OptionBuilder.withLongOpt("backup-path")
                .withDescription("specify a backup path to store the corrupted files (default: /tmp)").hasArg()
                .create());
        return result;
    }

    private static void printMap(JSONWriter writer, MapColumnVector vector, TypeDescription schema, int row)
            throws JSONException {
        writer.array();
        TypeDescription keyType = schema.getChildren().get(0);
        TypeDescription valueType = schema.getChildren().get(1);
        int offset = (int) vector.offsets[row];
        for (int i = 0; i < vector.lengths[row]; ++i) {
            writer.object();
            writer.key("_key");
            printValue(writer, vector.keys, keyType, offset + i);
            writer.key("_value");
            printValue(writer, vector.values, valueType, offset + i);
            writer.endObject();
        }
        writer.endArray();
    }

    private static void printList(JSONWriter writer, ListColumnVector vector, TypeDescription schema, int row)
            throws JSONException {
        writer.array();
        int offset = (int) vector.offsets[row];
        TypeDescription childType = schema.getChildren().get(0);
        for (int i = 0; i < vector.lengths[row]; ++i) {
            printValue(writer, vector.child, childType, offset + i);
        }
        writer.endArray();
    }

    private static void printUnion(JSONWriter writer, UnionColumnVector vector, TypeDescription schema, int row)
            throws JSONException {
        int tag = vector.tags[row];
        printValue(writer, vector.fields[tag], schema.getChildren().get(tag), row);
    }

    static void printStruct(JSONWriter writer, StructColumnVector batch, TypeDescription schema, int row)
            throws JSONException {
        writer.object();
        List<String> fieldNames = schema.getFieldNames();
        List<TypeDescription> fieldTypes = schema.getChildren();
        for (int i = 0; i < fieldTypes.size(); ++i) {
            writer.key(fieldNames.get(i));
            printValue(writer, batch.fields[i], fieldTypes.get(i), row);
        }
        writer.endObject();
    }

    static void printBinary(JSONWriter writer, BytesColumnVector vector, int row) throws JSONException {
        writer.array();
        int offset = vector.start[row];
        for (int i = 0; i < vector.length[row]; ++i) {
            writer.value(0xff & (int) vector.vector[row][offset + i]);
        }
        writer.endArray();
    }

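    /**
     * Writes one value from a column vector as JSON. For isRepeating
     * vectors the value lives at index 0; nulls become JSON null; complex
     * types (LIST, MAP, STRUCT, UNION) recurse through the helpers above.
     */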
    static void printValue(JSONWriter writer, ColumnVector vector, TypeDescription schema, int row)
            throws JSONException {
        if (vector.isRepeating) {
            row = 0;
        }
        if (vector.noNulls || !vector.isNull[row]) {
            switch (schema.getCategory()) {
            case BOOLEAN:
                writer.value(((LongColumnVector) vector).vector[row] != 0);
                break;
            case BYTE:
            case SHORT:
            case INT:
            case LONG:
                writer.value(((LongColumnVector) vector).vector[row]);
                break;
            case FLOAT:
            case DOUBLE:
                writer.value(((DoubleColumnVector) vector).vector[row]);
                break;
            case STRING:
            case CHAR:
            case VARCHAR:
                writer.value(((BytesColumnVector) vector).toString(row));
                break;
            case BINARY:
                printBinary(writer, (BytesColumnVector) vector, row);
                break;
            case DECIMAL:
                writer.value(((DecimalColumnVector) vector).vector[row].toString());
                break;
            case DATE:
                writer.value(new DateWritable((int) ((LongColumnVector) vector).vector[row]).toString());
                break;
            case TIMESTAMP:
                writer.value(((TimestampColumnVector) vector).asScratchTimestamp(row).toString());
                break;
            case LIST:
                printList(writer, (ListColumnVector) vector, schema, row);
                break;
            case MAP:
                printMap(writer, (MapColumnVector) vector, schema, row);
                break;
            case STRUCT:
                printStruct(writer, (StructColumnVector) vector, schema, row);
                break;
            case UNION:
                printUnion(writer, (UnionColumnVector) vector, schema, row);
                break;
            default:
                throw new IllegalArgumentException("Unknown type " + schema.toString());
            }
        } else {
            writer.value(null);
        }
    }

    static void printRow(JSONWriter writer, VectorizedRowBatch batch, TypeDescription schema, int row)
            throws JSONException {
        if (schema.getCategory() == TypeDescription.Category.STRUCT) {
            List<TypeDescription> fieldTypes = schema.getChildren();
            List<String> fieldNames = schema.getFieldNames();
            writer.object();
            for (int c = 0; c < batch.cols.length; ++c) {
                writer.key(fieldNames.get(c));
                printValue(writer, batch.cols[c], fieldTypes.get(c), row);
            }
            writer.endObject();
        } else {
            printValue(writer, batch.cols[0], schema, row);
        }
    }

    static void printJsonData(final Reader reader) throws IOException, JSONException {
        PrintStream printStream = System.out;
        OutputStreamWriter out = new OutputStreamWriter(printStream, "UTF-8");
        RecordReader rows = reader.rows();
        try {
            TypeDescription schema = reader.getSchema();
            VectorizedRowBatch batch = schema.createRowBatch();
            while (rows.nextBatch(batch)) {
                for (int r = 0; r < batch.size; ++r) {
                    JSONWriter writer = new JSONWriter(out);
                    printRow(writer, batch, schema, r);
                    out.write("\n");
                    out.flush();
                    if (printStream.checkError()) {
                        throw new IOException("Error encountered when writing to stdout.");
                    }
                }
            }
        } finally {
            rows.close();
        }
    }
}
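
For context, here is a minimal, hypothetical sketch (not part of the original file) that opens an ORC file with the same Reader API FileDump uses; the path is an assumption:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;

public class ReaderSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical path; point this at a real ORC file
        Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
                OrcFile.readerOptions(conf));
        System.out.println("Rows: " + reader.getNumberOfRows());
        System.out.println("Schema: " + reader.getSchema());
        System.out.println("Compression: " + reader.getCompressionKind());
    }
}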