org.apache.parquet.cli.Util.java Source code

Introduction

Here is the source code for org.apache.parquet.cli.Util.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.cli;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import org.apache.commons.codec.binary.Hex;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import java.util.Set;

import static org.apache.parquet.column.Encoding.BIT_PACKED;
import static org.apache.parquet.column.Encoding.DELTA_BINARY_PACKED;
import static org.apache.parquet.column.Encoding.DELTA_BYTE_ARRAY;
import static org.apache.parquet.column.Encoding.PLAIN;
import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY;
import static org.apache.parquet.column.Encoding.RLE;
import static org.apache.parquet.column.Encoding.RLE_DICTIONARY;
import static org.apache.parquet.column.Encoding.DELTA_LENGTH_BYTE_ARRAY;

public class Util {

    private static final long KB = 1024;
    private static final long MB = 1024 * KB;
    private static final long GB = 1024 * MB;
    private static final long TB = 1024 * GB;

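    /** Formats a byte count using binary units (kB, MB, GB, TB), e.g. 1536 becomes "1.500 kB". */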
    public static String humanReadable(float bytes) {
        if (bytes > TB) {
            return String.format("%.03f TB", bytes / TB);
        } else if (bytes > GB) {
            return String.format("%.03f GB", bytes / GB);
        } else if (bytes > MB) {
            return String.format("%.03f MB", bytes / MB);
        } else if (bytes > KB) {
            return String.format("%.03f kB", bytes / KB);
        } else {
            return String.format("%.02f B", bytes);
        }
    }

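    /** Long variant of {@link #humanReadable(float)}; values of one kilobyte or less are printed as exact byte counts. */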
    public static String humanReadable(long bytes) {
        if (bytes > TB) {
            return String.format("%.03f TB", ((float) bytes) / TB);
        } else if (bytes > GB) {
            return String.format("%.03f GB", ((float) bytes) / GB);
        } else if (bytes > MB) {
            return String.format("%.03f MB", ((float) bytes) / MB);
        } else if (bytes > KB) {
            return String.format("%.03f kB", ((float) bytes) / KB);
        } else {
            return String.format("%d B", bytes);
        }
    }

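    /** Formats the column's min and max statistics as "min / max"; returns "no stats" or "" when unavailable. */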
    public static String minMaxAsString(Statistics stats, OriginalType annotation) {
        if (stats == null) {
            return "no stats";
        }
        if (!stats.hasNonNullValue()) {
            return "";
        }
        return String.format("%s / %s", humanReadable(stats.minAsString(), 30),
                humanReadable(stats.maxAsString(), 30));
    }

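    /** Summarizes statistics as "min: ... max: ... nulls: &lt;numNulls&gt;/&lt;count&gt;". */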
    public static String toString(Statistics stats, long count, OriginalType annotation) {
        if (stats == null) {
            return "no stats";
        }
        return String.format("min: %s max: %s nulls: %d/%d", humanReadable(stats.minAsString(), 30),
                humanReadable(stats.maxAsString(), 30), stats.getNumNulls(), count);
    }

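    /** Wraps the value in quotes, truncating it with "..." when it would exceed len characters. */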
    public static String humanReadable(String str, int len) {
        if (str == null) {
            return "null";
        }

        StringBuilder sb = new StringBuilder();
        sb.append("\"");
        if (str.length() > len - 2) {
            sb.append(str.substring(0, len - 5)).append("...");
        } else {
            sb.append(str);
        }
        sb.append("\"");

        return sb.toString();
    }

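    /** Hex-encodes the value with a "0x" prefix, truncating it with "..." when it would exceed len characters. */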
    public static String humanReadable(byte[] bytes, int len) {
        if (bytes == null || bytes.length == 0) {
            return "null";
        }

        StringBuilder sb = new StringBuilder();
        String asString = Hex.encodeHexString(bytes);
        sb.append("0x");
        if (asString.length() > len - 2) {
            sb.append(asString.substring(0, (len - 5) / 2)).append("...");
        } else {
            sb.append(asString);
        }

        return sb.toString();
    }

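    /** Returns a one-character abbreviation for the compression codec. */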
    public static String shortCodec(CompressionCodecName codec) {
        switch (codec) {
        case UNCOMPRESSED:
            return "_";
        case SNAPPY:
            return "S";
        case GZIP:
            return "G";
        case LZO:
            return "L";
        case BROTLI:
            return "B";
        case LZ4:
            return "4";
        case ZSTD:
            return "Z";
        default:
            return "?";
        }
    }

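    /** Returns a one-character abbreviation for a page encoding; isDict indicates whether the page is a dictionary page. */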
    public static String encodingAsString(Encoding encoding, boolean isDict) {
        switch (encoding) {
        case PLAIN:
            return "_";
        case PLAIN_DICTIONARY:
            // data pages use RLE, dictionary pages use plain
            return isDict ? "_" : "R";
        case RLE_DICTIONARY:
            return "R";
        case DELTA_BINARY_PACKED:
        case DELTA_LENGTH_BYTE_ARRAY:
        case DELTA_BYTE_ARRAY:
            return "D";
        default:
            return "?";
        }
    }

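    /**
     * Builds the encoding flag string from EncodingStats: dictionary-page encodings first, then
     * data-page encodings, with " F" appended when some pages fell back from dictionary encoding.
     */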
    public static String encodingStatsAsString(EncodingStats encodingStats) {
        StringBuilder sb = new StringBuilder();
        if (encodingStats.hasDictionaryPages()) {
            for (Encoding encoding : encodingStats.getDictionaryEncodings()) {
                sb.append(encodingAsString(encoding, true));
            }
            sb.append(" ");
        } else {
            sb.append("  ");
        }

        Set<Encoding> encodings = encodingStats.getDataEncodings();
        if (encodings.contains(RLE_DICTIONARY) || encodings.contains(PLAIN_DICTIONARY)) {
            sb.append("R");
        }
        if (encodings.contains(PLAIN)) {
            sb.append("_");
        }
        if (encodings.contains(DELTA_BYTE_ARRAY) || encodings.contains(DELTA_BINARY_PACKED)
                || encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
            sb.append("D");
        }

        // Check for fallback and add a flag
        if (encodingStats.hasDictionaryEncodedPages() && encodingStats.hasNonDictionaryEncodedPages()) {
            sb.append(" F");
        }

        return sb.toString();
    }

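    /**
     * Builds the encoding flag string from the raw encoding set (used when EncodingStats are not
     * available): the first two characters describe the definition- and repetition-level encodings,
     * followed by the data page encodings.
     */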
    public static String encodingsAsString(Set<Encoding> encodings, ColumnDescriptor desc) {
        StringBuilder sb = new StringBuilder();
        if (encodings.contains(RLE) || encodings.contains(BIT_PACKED)) {
            sb.append(desc.getMaxDefinitionLevel() == 0 ? "B" : "R");
            sb.append(desc.getMaxRepetitionLevel() == 0 ? "B" : "R");
            if (encodings.contains(PLAIN_DICTIONARY)) {
                sb.append("R");
            }
            if (encodings.contains(PLAIN)) {
                sb.append("_");
            }
        } else {
            sb.append("RR");
            if (encodings.contains(RLE_DICTIONARY)) {
                sb.append("R");
            }
            if (encodings.contains(PLAIN)) {
                sb.append("_");
            }
            if (encodings.contains(DELTA_BYTE_ARRAY) || encodings.contains(DELTA_BINARY_PACKED)
                    || encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
                sb.append("D");
            }
        }
        return sb.toString();
    }

    private static final Splitter DOT = Splitter.on('.');

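    /** Looks up the ColumnDescriptor for a dotted column name, failing if the schema does not contain it. */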
    public static ColumnDescriptor descriptor(String column, MessageType schema) {
        String[] path = Iterables.toArray(DOT.split(column), String.class);
        Preconditions.checkArgument(schema.containsPath(path), "Schema doesn't have column: " + column);
        return schema.getColumnDescription(path);
    }

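    /** Returns the column's path as a dotted name. */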
    public static String columnName(ColumnDescriptor desc) {
        return Joiner.on('.').join(desc.getPath());
    }

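    /** Walks the schema along the given path and returns the first primitive type encountered, or null if none is found. */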
    public static PrimitiveType primitive(MessageType schema, String[] path) {
        Type current = schema;
        for (String part : path) {
            current = current.asGroupType().getType(part);
            if (current.isPrimitive()) {
                return current.asPrimitiveType();
            }
        }
        return null;
    }

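    /** Resolves a dotted column name to its primitive type, failing if the schema does not contain it. */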
    public static PrimitiveType primitive(String column, MessageType schema) {
        String[] path = Iterables.toArray(DOT.split(column), String.class);
        Preconditions.checkArgument(schema.containsPath(path), "Schema doesn't have column: " + column);
        return primitive(schema, path);
    }

}
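
Example usage (not part of the original class): a minimal sketch of how these helpers could be combined to print per-column metadata from a Parquet footer. The UtilExample class name, the command-line path handling, and the footer access via ParquetFileReader and HadoopInputFile are illustrative assumptions; only the Util methods themselves come from the file above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.cli.Util;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;

public class UtilExample {

    public static void main(String[] args) throws Exception {
        // Hypothetical input: the Parquet file to inspect is passed as the first argument.
        Path path = new Path(args[0]);
        try (ParquetFileReader reader =
                ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
            ParquetMetadata footer = reader.getFooter();
            MessageType schema = footer.getFileMetaData().getSchema();

            for (BlockMetaData block : footer.getBlocks()) {
                for (ColumnChunkMetaData column : block.getColumns()) {
                    String name = column.getPath().toDotString();
                    PrimitiveType type = Util.primitive(name, schema);

                    // Prefer the richer EncodingStats summary; fall back to the raw encoding set.
                    String encodings = column.getEncodingStats() != null
                            ? Util.encodingStatsAsString(column.getEncodingStats())
                            : Util.encodingsAsString(column.getEncodings(), Util.descriptor(name, schema));

                    System.out.printf("%-30s %s %s %s %s%n",
                            name,
                            Util.shortCodec(column.getCodec()),
                            encodings,
                            Util.humanReadable(column.getTotalSize()),
                            Util.toString(column.getStatistics(), column.getValueCount(),
                                    type == null ? null : type.getOriginalType()));
                }
            }
        }
    }
}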