org.apache.hadoop.hive.ql.stats.BasicStatsNoJobTask.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.stats.BasicStatsNoJobTask.java
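
BasicStatsNoJobTask computes basic table and partition statistics (numFiles, numRows, totalSize, rawDataSize) by reading file footers rather than by scanning rows. In Hive it backs statements such as ANALYZE TABLE ... COMPUTE STATISTICS NOSCAN for ORC and Parquet tables (full ACID tables excluded).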

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.stats;

import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.HiveStatsUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.StatsTask;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec;
import org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.apache.hive.common.util.ReflectionUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimaps;

/**
 * BasicStatsNoJobTask is used in cases where stats collection is the only task for the given
 * query (there is no parent MR or Tez job). It is used for ANALYZE ... NOSCAN on file formats
 * that implement the StatsProvidingRecordReader interface: ORC, for example, stores column
 * statistics for all columns in the file footer, so it is much faster to compute the
 * table/partition statistics by reading the footer than by scanning all the rows. This task
 * computes basic stats such as numFiles, numRows, fileSize and rawDataSize from the footer.
 * However, it cannot be used for full ACID tables, since some of their files may contain
 * updates and deletes to existing rows, making a sum of per-file row counts invalid.
 **/
public class BasicStatsNoJobTask implements IStatsProcessor {

    private static final Logger LOG = LoggerFactory.getLogger(BasicStatsNoJobTask.class);
    private HiveConf conf;

    private BasicStatsNoJobWork work;
    private LogHelper console;

    public BasicStatsNoJobTask(HiveConf conf, BasicStatsNoJobWork work) {
        this.conf = conf;
        this.work = work;
        console = new LogHelper(LOG);
    }

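    /**
     * Footer-based stats collection is possible for ORC (except full ACID tables, where
     * per-file row counts cannot simply be summed) and for Parquet.
     */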
    public static boolean canUseFooterScan(Table table, Class<? extends InputFormat> inputFormat) {
        return (OrcInputFormat.class.isAssignableFrom(inputFormat) && !AcidUtils.isFullAcidTable(table))
                || MapredParquetInputFormat.class.isAssignableFrom(inputFormat);
    }

    @Override
    public void initialize(CompilationOpContext opContext) {

    }

    @Override
    public int process(Hive db, Table tbl) throws Exception {

        LOG.info("Executing stats (no job) task");

        ExecutorService threadPool = StatsTask.newThreadPool(conf);

        return aggregateStats(threadPool, db);
    }

    public StageType getType() {
        return StageType.STATS;
    }

    public String getName() {
        return "STATS-NO-JOB";
    }

    static class StatItem {
        Partish partish;
        Map<String, String> params;
        Object result;
    }

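    /**
     * Collects basic stats (numRows, rawDataSize, totalSize, numFiles) for a single table or
     * partition by reading file footers, and stores the updated Table/Partition object in
     * {@code result} on success; {@code result} stays null on failure.
     */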
    static class FooterStatCollector implements Runnable {

        private Partish partish;
        private Object result;
        private JobConf jc;
        private Path dir;
        private FileSystem fs;
        private LogHelper console;

        public FooterStatCollector(JobConf jc, Partish partish) {
            this.jc = jc;
            this.partish = partish;
        }

        public static final Function<FooterStatCollector, String> SIMPLE_NAME_FUNCTION = new Function<FooterStatCollector, String>() {

            @Override
            public String apply(FooterStatCollector sc) {
                return String.format("%s#%s", sc.partish.getTable().getCompleteName(), sc.partish.getPartishType());
            }
        };
        private static final Function<FooterStatCollector, Partition> EXTRACT_RESULT_FUNCTION = new Function<FooterStatCollector, Partition>() {
            @Override
            public Partition apply(FooterStatCollector input) {
                return (Partition) input.result;
            }
        };

        private boolean isValid() {
            return result != null;
        }

        public void init(HiveConf conf, LogHelper console) throws IOException {
            this.console = console;
            dir = new Path(partish.getPartSd().getLocation());
            fs = dir.getFileSystem(conf);
        }

        @Override
        public void run() {

            Map<String, String> parameters = partish.getPartParameters();
            try {
                long numRows = 0;
                long rawDataSize = 0;
                long fileSize = 0;
                long numFiles = 0;
                long numErasureCodedFiles = 0;
                // Note: this code would be invalid for transactional tables of any kind.
                Utilities.FILE_OP_LOGGER.debug("Aggregating stats for {}", dir);
                List<FileStatus> fileList = null;
                if (partish.getTable() != null && AcidUtils.isTransactionalTable(partish.getTable())) {
                    fileList = AcidUtils.getAcidFilesForStats(partish.getTable(), dir, jc, fs);
                } else {
                    fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
                }

                for (FileStatus file : fileList) {
                    Utilities.FILE_OP_LOGGER.debug("Computing stats for {}", file);
                    if (!file.isDirectory()) {
                        InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(partish.getInputFormatClass(),
                                jc);
                        InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0,
                                new String[] { partish.getLocation() });
                        if (file.getLen() == 0) {
                            numFiles += 1;
                        } else {
                            org.apache.hadoop.mapred.RecordReader<?, ?> recordReader = inputFormat
                                    .getRecordReader(dummySplit, jc, Reporter.NULL);
                            try {
                                if (recordReader instanceof StatsProvidingRecordReader) {
                                    StatsProvidingRecordReader statsRR;
                                    statsRR = (StatsProvidingRecordReader) recordReader;
                                    rawDataSize += statsRR.getStats().getRawDataSize();
                                    numRows += statsRR.getStats().getRowCount();
                                    fileSize += file.getLen();
                                    numFiles += 1;
                                    if (file.isErasureCoded()) {
                                        numErasureCodedFiles++;
                                    }
                                } else {
                                    throw new HiveException(String.format(
                                            "File format does not provide footer statistics: %s", file));
                                }
                            } finally {
                                recordReader.close();
                            }
                        }
                    }
                }

                StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);

                parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows));
                parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize));
                parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize));
                parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles));
                parameters.put(StatsSetupConst.NUM_ERASURE_CODED_FILES, String.valueOf(numErasureCodedFiles));

                if (partish.getPartition() != null) {
                    result = new Partition(partish.getTable(), partish.getPartition().getTPartition());
                } else {
                    result = new Table(partish.getTable().getTTable());
                }

                String msg = partish.getSimpleName() + " stats: [" + toString(parameters) + ']';
                LOG.debug(msg);
                console.printInfo(msg);

            } catch (Exception e) {
                console.printInfo("[Warning] could not update stats for " + partish.getSimpleName() + ".",
                        "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
            }
        }

        private String toString(Map<String, String> parameters) {
            StringBuilder builder = new StringBuilder();
            for (String statType : StatsSetupConst.SUPPORTED_STATS) {
                String value = parameters.get(statType);
                if (value != null) {
                    if (builder.length() > 0) {
                        builder.append(", ");
                    }
                    builder.append(statType).append('=').append(value);
                }
            }
            return builder.toString();
        }

    }

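    /**
     * Builds a FooterStatCollector for the target table or each of its partitions, runs the
     * collectors on the given thread pool, and persists the gathered stats to the metastore.
     * Returns 0 on success, non-zero on failure.
     */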
    private int aggregateStats(ExecutorService threadPool, Hive db) {
        int ret = 0;
        try {
            JobConf jc = new JobConf(conf);

            TableSpec tableSpecs = work.getTableSpecs();

            if (tableSpecs == null) {
                throw new RuntimeException("this is unexpected...needs some investigation");
            }

            Table table = tableSpecs.tableHandle;

            Collection<Partition> partitions = null;
            if (work.getPartitions() == null || work.getPartitions().isEmpty()) {
                if (table.isPartitioned()) {
                    partitions = tableSpecs.partitions;
                }
            } else {
                partitions = work.getPartitions();
            }

            LinkedList<Partish> partishes = Lists.newLinkedList();
            if (partitions == null) {
                partishes.add(Partish.buildFor(table));
            } else {
                for (Partition part : partitions) {
                    partishes.add(Partish.buildFor(table, part));
                }
            }

            List<FooterStatCollector> scs = Lists.newArrayList();
            for (Partish partish : partishes) {
                scs.add(new FooterStatCollector(jc, partish));
            }

            for (FooterStatCollector sc : scs) {
                sc.init(conf, console);
                threadPool.execute(sc);
            }

            LOG.debug("Stats collection waiting for threadpool to shutdown..");
            shutdownAndAwaitTermination(threadPool);
            LOG.debug("Stats collection threadpool shutdown successful.");

            ret = updatePartitions(db, scs, table);

        } catch (Exception e) {
            console.printError("Failed to collect footer statistics.",
                    "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
            // Fail the query if the stats are supposed to be reliable
            if (work.isStatsReliable()) {
                ret = -1;
            }
        }

        // The return value of 0 indicates success,
        // anything else indicates failure
        return ret;
    }

    private int updatePartitions(Hive db, List<FooterStatCollector> scs, Table table)
            throws InvalidOperationException, HiveException {

        String tableFullName = table.getFullyQualifiedName();

        if (scs.isEmpty()) {
            return 0;
        }
        if (work.isStatsReliable()) {
            for (FooterStatCollector statsCollection : scs) {
                if (statsCollection.result == null) {
                    LOG.debug("Stats requested to be reliable. Empty stats found: {}",
                            statsCollection.partish.getSimpleName());
                    return -1;
                }
            }
        }
        List<FooterStatCollector> validColectors = Lists.newArrayList();
        for (FooterStatCollector statsCollection : scs) {
            if (statsCollection.isValid()) {
                validColectors.add(statsCollection);
            }
        }

        EnvironmentContext environmentContext = new EnvironmentContext();
        environmentContext.putToProperties(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE);

        ImmutableListMultimap<String, FooterStatCollector> collectorsByTable = Multimaps.index(validColectors,
                FooterStatCollector.SIMPLE_NAME_FUNCTION);

        LOG.debug("Collectors.size(): {}", collectorsByTable.keySet());

        if (collectorsByTable.keySet().size() < 1) {
            LOG.warn("Collectors are empty! ; {}", tableFullName);
        }

        // for now this should be true...
        assert (collectorsByTable.keySet().size() <= 1);

        LOG.debug("Updating stats for: {}", tableFullName);

        for (String partName : collectorsByTable.keySet()) {
            ImmutableList<FooterStatCollector> values = collectorsByTable.get(partName);

            if (values == null) {
                throw new RuntimeException("Unexpected null collector list for " + partName);
            }

            if (values.get(0).result instanceof Table) {
                db.alterTable(tableFullName, (Table) values.get(0).result, environmentContext, true);
                LOG.debug("Updated stats for {}.", tableFullName);
            } else {
                if (values.get(0).result instanceof Partition) {
                    List<Partition> results = Lists.transform(values, FooterStatCollector.EXTRACT_RESULT_FUNCTION);
                    db.alterPartitions(tableFullName, results, environmentContext, true);
                    LOG.debug("Bulk updated {} partitions of {}.", results.size(), tableFullName);
                } else {
                    throw new RuntimeException(
                            "Inconsistent collector result type: " + values.get(0).result);
                }
            }
        }
        LOG.debug("Updated stats for: {}", tableFullName);
        return 0;
    }

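    /**
     * Two-phase shutdown: stop accepting new tasks, wait (in 10-second intervals) for the
     * running collectors to finish, then cancel anything that is still executing.
     */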
    private void shutdownAndAwaitTermination(ExecutorService threadPool) {

        // Disable new tasks from being submitted
        threadPool.shutdown();
        try {

            // Wait for existing tasks to terminate; note that this loop waits
            // indefinitely until all running collectors have finished
            while (!threadPool.awaitTermination(10, TimeUnit.SECONDS)) {
                LOG.debug("Waiting for all stats tasks to finish...");
            }
            // Cancel currently executing tasks
            threadPool.shutdownNow();

            // Wait a while for tasks to respond to being cancelled
            if (!threadPool.awaitTermination(100, TimeUnit.SECONDS)) {
                LOG.debug("Stats collection thread pool did not terminate");
            }
        } catch (InterruptedException ie) {

            // Cancel again if current thread also interrupted
            threadPool.shutdownNow();

            // Preserve interrupt status
            Thread.currentThread().interrupt();
        }
    }

    @Override
    public void setDpPartSpecs(Collection<Partition> dpPartSpecs) {
    }
}
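
Example

For context, the heart of FooterStatCollector.run() above is the footer scan itself. Below is a minimal, self-contained sketch of that idea, not part of the Hive source: it opens a RecordReader over a zero-length split for a single ORC file and, when the reader implements StatsProvidingRecordReader, pulls the row count and raw data size from the footer instead of scanning any rows. The class name FooterStatsSketch and the command-line argument handling are illustrative placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class FooterStatsSketch {

    public static void main(String[] args) throws Exception {
        JobConf jc = new JobConf();
        // Path to a single ORC file, supplied by the caller (placeholder).
        Path path = new Path(args[0]);

        // A zero-length split is enough: the reader only needs the file footer,
        // exactly as FooterStatCollector.run() does with its dummy split.
        FileSplit split = new FileSplit(path, 0, 0, new String[0]);
        InputFormat<?, ?> inputFormat = new OrcInputFormat();
        RecordReader<?, ?> reader = inputFormat.getRecordReader(split, jc, Reporter.NULL);
        try {
            if (reader instanceof StatsProvidingRecordReader) {
                StatsProvidingRecordReader statsRR = (StatsProvidingRecordReader) reader;
                // Basic stats come straight from the footer; no rows are read.
                System.out.println("numRows=" + statsRR.getStats().getRowCount()
                        + ", rawDataSize=" + statsRR.getStats().getRawDataSize());
            } else {
                System.err.println("File format does not provide footer statistics");
            }
        } finally {
            reader.close();
        }
    }
}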