org.apache.accumulo.core.file.rfile.PrintInfo.java Source code

Introduction

Here is the source code for org.apache.accumulo.core.file.rfile.PrintInfo.java. This class implements Accumulo's "rfile-info" keyword: it prints metadata for one or more RFiles and can optionally dump key/value pairs (-d), report visibility metrics (-v, --visHash), print a histogram of key/value sizes (--histogram), and summarize key-length statistics (--keyStats).

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.core.file.rfile;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.accumulo.core.cli.Help;
import org.apache.accumulo.core.conf.DefaultConfiguration;
import org.apache.accumulo.core.conf.SiteConfiguration;
import org.apache.accumulo.core.data.ByteSequence;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.file.FileSKVIterator;
import org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile;
import org.apache.accumulo.core.file.rfile.RFile.Reader;
import org.apache.accumulo.start.spi.KeywordExecutable;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.beust.jcommander.Parameter;
import com.google.auto.service.AutoService;

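/**
 * Prints metadata and optional statistics about RFiles. Registered as the
 * "rfile-info" keyword executable, so it runs as: accumulo rfile-info [options] file...
 */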
@AutoService(KeywordExecutable.class)
public class PrintInfo implements KeywordExecutable {

    private static final Logger log = LoggerFactory.getLogger(PrintInfo.class);

    static class Opts extends Help {
        @Parameter(names = { "-d", "--dump" }, description = "dump the key/value pairs")
        boolean dump = false;
        @Parameter(names = { "-v", "--vis" }, description = "show visibility metrics")
        boolean vis = false;
        @Parameter(names = { "--visHash" }, description = "show visibilities as hashes, implies -v")
        boolean hash = false;
        @Parameter(names = { "--histogram" }, description = "print a histogram of the key-value sizes")
        boolean histogram = false;
        @Parameter(names = {
                "--useSample" }, description = "Use sample data for --dump, --vis, --histogram options")
        boolean useSample = false;
        @Parameter(names = { "--keyStats" }, description = "print key length statistics for index and all data")
        boolean keyStats = false;
        @Parameter(description = " <file> { <file> ... }")
        List<String> files = new ArrayList<>();
        @Parameter(names = { "-c",
                "--config" }, variableArity = true, description = "Comma-separated Hadoop configuration files")
        List<String> configFiles = new ArrayList<>();
    }

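    /**
     * Histogram over base-10 logarithmic buckets: bucket i counts sizes in
     * [10^i, 10^(i+1)), so a 250-byte entry lands in bucket 2. Assumes sizes
     * are at least 1; log10(0) would yield an invalid bucket index.
     */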
    static class LogHistogram {
        long[] countBuckets = new long[11];
        long[] sizeBuckets = new long[countBuckets.length];
        long totalSize = 0;

        public void add(int size) {
            int bucket = (int) Math.log10(size);
            countBuckets[bucket]++;
            sizeBuckets[bucket] += size;
            totalSize += size;
        }

        public void print(String indent) {
            System.out.println(indent + "Up to size      count      %-age");
            // Row i is labeled "up to 10^i", so it must report bucket i - 1,
            // which holds sizes in the range [10^(i-1), 10^i).
            for (int i = 1; i < countBuckets.length; i++) {
                System.out.println(String.format("%s%11.0f : %10d %6.2f%%", indent, Math.pow(10, i),
                        countBuckets[i - 1], sizeBuckets[i - 1] * 100. / totalSize));
            }
        }
    }

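    /**
     * Tracks key-length statistics (min/max/mean/stddev via commons-math)
     * together with a log-scale histogram of key sizes.
     */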
    static class KeyStats {
        private SummaryStatistics stats = new SummaryStatistics();
        private LogHistogram logHistogram = new LogHistogram();

        public void add(Key k) {
            int size = k.getSize();
            stats.addValue(size);
            logHistogram.add(size);
        }

        public void print(String indent) {
            logHistogram.print(indent);
            System.out.println();
            System.out.printf("%smin:%,11.2f max:%,11.2f avg:%,11.2f stddev:%,11.2f\n", indent, stats.getMin(),
                    stats.getMax(), stats.getMean(), stats.getStandardDeviation());
        }
    }

    public static void main(String[] args) throws Exception {
        new PrintInfo().execute(args);
    }

    @Override
    public String keyword() {
        return "rfile-info";
    }

    @Override
    public String description() {
        return "Prints rfile info";
    }

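    // Parses the options, loads any extra Hadoop configuration files, then
    // prints metadata (and any requested dumps or statistics) for each file.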
    @Override
    public void execute(final String[] args) throws Exception {
        Opts opts = new Opts();
        opts.parseArgs(PrintInfo.class.getName(), args);
        if (opts.files.isEmpty()) {
            System.err.println("No files were given");
            System.exit(-1);
        }

        Configuration conf = new Configuration();
        for (String confFile : opts.configFiles) {
            log.debug("Adding Hadoop configuration file " + confFile);
            conf.addResource(new Path(confFile));
        }

        FileSystem hadoopFs = FileSystem.get(conf);
        FileSystem localFs = FileSystem.getLocal(conf);

        LogHistogram kvHistogram = new LogHistogram();

        KeyStats dataKeyStats = new KeyStats();
        KeyStats indexKeyStats = new KeyStats();

        for (String arg : opts.files) {
            Path path = new Path(arg);
            FileSystem fs;
            if (arg.contains(":")) {
                fs = path.getFileSystem(conf);
            } else {
                log.warn("Attempting to find file across filesystems. Consider providing URI instead of path");
                fs = hadoopFs.exists(path) ? hadoopFs : localFs; // fall back to local
            }
            System.out.println(
                    "Reading file: " + path.makeQualified(fs.getUri(), fs.getWorkingDirectory()).toString());

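            // Open the file as a block-compressed RFile: CachableBlockFile.Reader
            // serves raw blocks, and RFile.Reader interprets them as key/value data.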
            CachableBlockFile.Reader _rdr = new CachableBlockFile.Reader(fs, path, conf, null, null,
                    SiteConfiguration.getInstance(DefaultConfiguration.getInstance()));
            Reader iter = new RFile.Reader(_rdr);
            MetricsGatherer<Map<String, ArrayList<VisibilityMetric>>> vmg = new VisMetricsGatherer();

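            // Register the visibility metrics gatherer only when requested
            // (--visHash implies -v, so either flag enables it).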
            if (opts.vis || opts.hash)
                iter.registerMetrics(vmg);

            iter.printInfo();
            System.out.println();
            org.apache.accumulo.core.file.rfile.bcfile.PrintInfo.main(new String[] { arg });

            Map<String, ArrayList<ByteSequence>> localityGroupCF = null;

            if (opts.histogram || opts.dump || opts.vis || opts.hash || opts.keyStats) {
                localityGroupCF = iter.getLocalityGroupCF();

                FileSKVIterator dataIter;
                if (opts.useSample) {
                    dataIter = iter.getSample();

                    if (dataIter == null) {
                        System.out.println("ERROR : This rfile has no sample data");
                        return;
                    }
                } else {
                    dataIter = iter;
                }

                if (opts.keyStats) {
                    FileSKVIterator indexIter = iter.getIndex();
                    while (indexIter.hasTop()) {
                        indexKeyStats.add(indexIter.getTopKey());
                        indexIter.next();
                    }
                }

                for (Entry<String, ArrayList<ByteSequence>> cf : localityGroupCF.entrySet()) {

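                    // Full-range seek (null start/end keys), restricted to this
                    // locality group's column families (inclusive = true).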
                    dataIter.seek(new Range((Key) null, (Key) null), cf.getValue(), true);
                    while (dataIter.hasTop()) {
                        Key key = dataIter.getTopKey();
                        Value value = dataIter.getTopValue();
                        if (opts.dump) {
                            System.out.println(key + " -> " + value);
                            if (System.out.checkError())
                                return;
                        }
                        if (opts.histogram) {
                            kvHistogram.add(key.getSize() + value.getSize());
                        }
                        if (opts.keyStats) {
                            dataKeyStats.add(key);
                        }
                        dataIter.next();
                    }
                }
            }

            iter.close();

            if (opts.vis || opts.hash) {
                System.out.println();
                vmg.printMetrics(opts.hash, "Visibility", System.out);
            }

            if (opts.histogram) {
                System.out.println();
                kvHistogram.print("");
            }

            if (opts.keyStats) {
                System.out.println();
                System.out.println("Statistics for keys in data :");
                dataKeyStats.print("\t");
                System.out.println();
                System.out.println("Statistics for keys in index :");
                indexKeyStats.print("\t");
            }
            // If the output stream has closed, there is no reason to keep going.
            if (System.out.checkError())
                return;
        }
    }
}
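
Example

The tool is normally run as "accumulo rfile-info", but the class can also be driven programmatically through its public execute method. A minimal sketch, assuming Accumulo and Hadoop are on the classpath; the RFile path is a hypothetical placeholder, not from the original source:

import org.apache.accumulo.core.file.rfile.PrintInfo;

public class PrintInfoExample {
    public static void main(String[] args) throws Exception {
        // Equivalent to the command line:
        //   accumulo rfile-info --histogram --keyStats /tmp/example.rf
        // "/tmp/example.rf" is a placeholder; point it at a real RFile.
        new PrintInfo().execute(new String[] { "--histogram", "--keyStats", "/tmp/example.rf" });
    }
}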