org.apache.hadoop.hive.ql.exec.ComputationBalancerReducer.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.exec.ComputationBalancerReducer.java, from Tencent's TDW fork of Apache Hive. The class aggregates the per-column statistics emitted by the sampling, stats-collection, and histogram operators (record counts, field lengths, null counters, most-common-value lists), then flushes the combined results, along with the table's DFS usage, to a statistics file.

Source

/**
* Tencent is pleased to support the open source community by making TDW available.
* Copyright (C) 2014 THL A29 Limited, a Tencent company. All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use 
* this file except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed 
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
* OF ANY KIND, either express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;

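/**
 * Aggregates the per-column statistics emitted by SampleOperator,
 * StatsCollectionOperator and HistogramOperator (record counts, field
 * lengths, null counters, sampled rows and most-common-value lists) and
 * flushes the combined results, together with the table's DFS usage, to a
 * single statistics file.
 */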
public class ComputationBalancerReducer {

    final String NUM_MCV_ATTR = "NumberOfMcv";
    final String NUM_BIN_ATTR = "NumberOfBin";
    public static final String CBR_TABLENAME_ATTR = "_TableName_";
    public static final String CBR_TABLEDATALOCATION_ATTR = "_TableDataLocation_";
    public static final String CBR_FLUSHFILEURI_ATTR = "_flushFile_uri_";

    int stat_num_records;
    int stat_num_units;
    int stat_sampled_counter;
    int _sampledRecordNumber_;

    long stat_total_size;
    long stat_num_files;
    long stat_num_blocks;

    int stat_null_counter;
    int stat_nullfac;
    int stat_avg_field_width;
    int stat_distinct_values;

    StringBuilder _hStringBuilder;
    String fldName;
    String funcName;
    String destURIString;

    String _finalTblName_;
    String _tableDataLocation_;

    int num_of_bins;
    FileSystem _fs;

    JobConf jc;
    Table table;
    Log LOG = LogFactory.getLog(this.getClass().getName());

    TreeMap<String, Integer> infoDict;
    TreeMap<String, String> epDict;
    TreeMap<String, TreeMap<String, Integer>> mcvList;
    HashMap<String, String> tblStructHashMap;

    ArrayList<Object> _testList;

    public ComputationBalancerReducer() {
        stat_num_records = 0;
        stat_num_units = 0;
        stat_sampled_counter = 0;

        stat_total_size = 0;
        stat_num_files = 0;
        stat_num_blocks = 0;

        stat_null_counter = 0;
        stat_nullfac = 0;
        stat_avg_field_width = 0;
        stat_distinct_values = 0;

        _sampledRecordNumber_ = 0;

        fldName = "";
        funcName = "";
        destURIString = "";
        _tableDataLocation_ = "";

        infoDict = new TreeMap<String, Integer>();
        mcvList = new TreeMap<String, TreeMap<String, Integer>>();
        epDict = new TreeMap<String, String>();
        tblStructHashMap = new HashMap<String, String>();
        _hStringBuilder = new StringBuilder();
        // Initialize the sample buffer up front; reduce() appends to it for
        // SAMPLE_DATA_ATTR keys and would otherwise hit a NullPointerException.
        _testList = new ArrayList<Object>();

    }

    public void setTableName(String cp) {
        _finalTblName_ = cp;
    }

    public String getTableName() {
        return _finalTblName_;
    }

    public void setTableDataLocation(String _copy) {
        this._tableDataLocation_ = _copy;
    }

    public String getTableDataLocation() {
        return this._tableDataLocation_;
    }

    public void setJobConf(JobConf para) {
        this.jc = para;
    }

    public void setDestURI(String para) {
        this.destURIString = para;
    }

    public String getDestURI() {
        return this.destURIString;
    }

    public void setNumberOfBins(int para) {
        this.num_of_bins = para;
    }

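    /**
     * Merges a single (key, value) pair into the running aggregates. The key
     * prefix selects the statistic: field-length and null counters are summed
     * per column, record and sample counters are totaled, sampled rows are
     * buffered, and MCV entries are folded into per-column frequency maps
     * that are compacted once they grow past 512 distinct values.
     */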
    public void reduce(BytesWritable wrappedKey, Text wrappedValue) {
        final String _key = ToolBox.getOriginalKey(wrappedKey);

        if (_key.startsWith(StatsCollectionOperator.FIELDLENGTH_ATTR)) {
            Integer iValue = Integer.valueOf(wrappedValue.toString());
            Integer _reg_ = infoDict.get(_key);
            if (_reg_ == null) {
                _reg_ = iValue;
            } else {
                _reg_ += iValue;
            }
            infoDict.put(_key, _reg_);
            LOG.debug("FieldLength:  " + _key + " " + _reg_);

        } else if (_key.startsWith(StatsCollectionOperator.NULLCOUNTER_ATTR)) {
            Integer iValue = Integer.valueOf(wrappedValue.toString());
            Integer _reg_ = infoDict.get(_key);
            if (_reg_ == null) {
                _reg_ = iValue;
            } else {
                _reg_ += iValue;
            }
            infoDict.put(_key, _reg_);
            LOG.debug("NullCounter:  " + _key + " " + _reg_);

        } else if (_key.startsWith(StatsCollectionOperator.RECORDSNUM_ATTR)) {
            Integer iValue = Integer.valueOf(wrappedValue.toString());
            stat_num_records += iValue;

        } else if (_key.startsWith(SampleOperator.SAMPLE_COUNTER_ATTR)) {
            Integer iValue = Integer.valueOf(wrappedValue.toString());
            _sampledRecordNumber_ += iValue;

        } else if (_key.startsWith(SampleOperator.SAMPLE_DATA_ATTR)) {
            // Copy the value: Writable instances are commonly reused by the caller.
            _testList.add(new Text(wrappedValue));

        } else if (_key.startsWith(HistogramOperator.MCVLIST_ATTR)) {
            if (mcvList == null) {
                return;
            }
            {
                StringTokenizer _hst_ = new StringTokenizer(wrappedValue.toString(), ToolBox.hiveDelimiter);
                String _true_value_ = _hst_.nextToken();
                String _true_fre_ = _hst_.nextToken();
                TreeMap<String, Integer> _valfre_map_ = mcvList.get(_key);

                if (_valfre_map_ == null) {
                    _valfre_map_ = new TreeMap<String, Integer>();
                    _valfre_map_.put(_true_value_, Integer.valueOf(_true_fre_));
                    mcvList.put(_key, _valfre_map_);
                } else {
                    Integer _o_fre_ = _valfre_map_.get(_true_value_);
                    if (_o_fre_ == null) {
                        _o_fre_ = Integer.valueOf(0);
                    }
                    _o_fre_ += Integer.valueOf(_true_fre_);
                    _valfre_map_.put(_true_value_, _o_fre_);

                }

                if (_valfre_map_.keySet().size() > 512) {
                    ToolBox _tb = new ToolBox();
                    _tb.compact(_valfre_map_, ToolBox.SortMethod.DescendSort, Integer.valueOf(512));
                }

            }

        } else if (_key.startsWith(SampleOperator.STATISTICS_SAMPLING_FACTOR_ATTR)) {
            // Parse the value, not the serialized key, when summing the sampling counter.
            Integer ax = Integer.valueOf(wrappedValue.toString());
            stat_sampled_counter += ax;
        }

    }

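    /**
     * Flushes all accumulated statistics to a uniquely named file under the
     * destination URI, aborting early if no records were counted.
     */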
    public void close() {

        try {
            if (stat_num_records == 0) {
                System.out.println("[ERROR] stat_num_records == 0, Abort.");
                LOG.error("[ERROR] stat_num_records == 0, Abort.");
                return;
            }
            System.out.println("The sample record counter is " + _sampledRecordNumber_);
            Path outPath = new Path(destURIString + java.util.UUID.randomUUID().toString());
            _fs = outPath.getFileSystem(jc);
            FSDataOutputStream _outputStream = _fs.create(outPath);

            _outputStream.writeBytes(this.getTableName() + "\n");

            flushTableStatsInfo(_outputStream);
            flushDistinctValue(_outputStream);
            flushMCVlist(_outputStream);
            flushHistogram(_outputStream);
            flushInfo_DFS(_outputStream);

            _outputStream.close();

        } catch (Exception e) {
            // Route the failure through the class logger rather than stderr.
            LOG.error("Failed to flush statistics to " + destURIString, e);
        }
    }

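    /**
     * Writes each column's most-common-value list, sorted by descending
     * frequency and capped at 256 entries, together with the fraction of
     * sampled records each value represents.
     */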
    void flushMCVlist(FSDataOutputStream out) throws Exception {

        out.writeBytes("MCVList\n");
        for (String _iter_outside_ : mcvList.keySet()) {
            TreeMap<String, Integer> _TreeMap = mcvList.get(_iter_outside_);
            out.writeBytes(_iter_outside_ + "\n");
            ToolBox _tb = new ToolBox();

            for (String _s_ : _TreeMap.keySet()) {
                _tb.push(_s_, _TreeMap.get(_s_));
            }

            if (_TreeMap.keySet().size() > 256) {

                _tb.compact(_TreeMap, ToolBox.SortMethod.DescendSort, Integer.valueOf(256));
            } else {
                _tb.descendSort();
            }

            for (int idx = 0; idx < _tb.getCapacity(); idx++) {
                double _tmp_frac_ = ((double) _tb.getIntegeAtIdx(idx) / (double) _sampledRecordNumber_);
                out.writeBytes(
                        _iter_outside_ + ToolBox.hiveDelimiter + _tb.getStringAtIdx(idx) + ToolBox.hiveDelimiter
                                + _tb.getIntegeAtIdx(idx) + ToolBox.hiveDelimiter + _tmp_frac_ + "\n");
            }
        }

    }

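    /**
     * Bins each column's value/frequency map into ten buckets via
     * HistogramOperator.binning() and writes one line per distinct bucket
     * boundary.
     */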
    void flushHistogram(FSDataOutputStream out) throws Exception {
        out.writeBytes(HistogramOperator.HISTOGRAMTABLE + "\n");
        for (String _s : mcvList.keySet()) {
            out.writeBytes(_s + "\n");
            ToolBox _tb = new ToolBox();
            TreeMap<String, Integer> _tsi = mcvList.get(_s);
            for (String _s_inner_ : _tsi.keySet()) {
                _tb.push(_s_inner_, _tsi.get(_s_inner_));
            }
            ToolBox _copyBox = HistogramOperator.binning(_tb, 10);
            // Defensive: skip columns whose binning produced no buckets before indexing element 0.
            if (_copyBox.getCapacity() == 0) {
                continue;
            }
            String _curString = null;
            String _preString = _copyBox.getStringAtIdx(0);
            int idx;
            for (idx = 1; idx < _copyBox.getCapacity(); idx++) {
                _curString = _copyBox.getStringAtIdx(idx);
                if (_curString.equals(_preString)) {
                    continue;
                } else {
                    out.writeBytes(_copyBox.getIntegeAtIdx(idx - 1) + ToolBox.hiveDelimiter + _s
                            + ToolBox.hiveDelimiter + _copyBox.getStringAtIdx(idx - 1) + "\n");

                    _preString = _curString;
                }
            }

            out.writeBytes(_copyBox.getIntegeAtIdx(idx - 1) + ToolBox.hiveDelimiter + _s + ToolBox.hiveDelimiter
                    + _copyBox.getStringAtIdx(idx - 1) + "\n");

        }

    }

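    /**
     * Writes a distinct-value estimate per column, computed by
     * ToolBox.calDistincValue() from the MCV frequencies and the record count.
     */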
    void flushDistinctValue(FSDataOutputStream out) throws Exception {
        out.writeBytes("DistinctValue" + "\n");
        for (String _s : mcvList.keySet()) {
            double _result = ToolBox.calDistincValue(mcvList.get(_s), stat_num_records);
            String _suffix = _s.substring(HistogramOperator.MCVLIST_ATTR.length());
            String _finalString = "DistinctValue" + _suffix;
            out.writeBytes(_finalString + ToolBox.hiveDelimiter + _result + "\n");

        }
    }

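    /**
     * Writes the total record count and, per column, the accumulated field
     * length and null counters plus their derived statistics: average field
     * width and null fraction.
     */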
    void flushTableStatsInfo(FSDataOutputStream out) throws Exception {
        out.writeBytes("TableInformation\n");
        out.writeBytes("stat_num_records" + ToolBox.hiveDelimiter + stat_num_records + "\n");
        ArrayList<TreeMap<String, Integer>> _a = ToolBox.<Integer>aggregateKey(infoDict, ToolBox.hiveDelimiter, 2);

        LOG.debug("stat_num_records:  " + stat_num_records);

        for (TreeMap<String, Integer> treeMap : _a) {
            if (treeMap == null)
                continue;
            for (String _s : treeMap.keySet()) {
                out.writeBytes(_s + ToolBox.hiveDelimiter + treeMap.get(_s) + "\n");
                LOG.debug(_s + ToolBox.hiveDelimiter + treeMap.get(_s));
                if (_s.startsWith(StatsCollectionOperator.FIELDLENGTH_ATTR)) {
                    double _tmp = (double) treeMap.get(_s) / (double) stat_num_records;
                    String _avg = StatsCollectionOperator.AVGFIELDWIDTH
                            + _s.substring(StatsCollectionOperator.FIELDLENGTH_ATTR.length());
                    out.writeBytes(_avg + ToolBox.hiveDelimiter + (long) _tmp + "\n");
                }

                if (_s.startsWith(StatsCollectionOperator.NULLCOUNTER_ATTR)) {
                    double _tmp = (double) treeMap.get(_s) / (double) stat_num_records;
                    String _avg = StatsCollectionOperator.NULLFAC_ATTR
                            + _s.substring(StatsCollectionOperator.NULLCOUNTER_ATTR.length());
                    out.writeBytes(_avg + ToolBox.hiveDelimiter + _tmp + "\n");
                }
            }
        }
    }

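    /**
     * Recursively walks the given path, accumulating the file count, total
     * size and block count of the table's data.
     */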
    void getFileSystemStats(Path uri) throws IOException {
        FileStatus _ax = _fs.getFileStatus(uri);
        if (_ax.isDir()) {
            FileStatus[] _fsArr = _fs.listStatus(uri);
            for (FileStatus obj : _fsArr) {
                getFileSystemStats(obj.getPath());
            }
        } else {
            ContentSummary _cs = _fs.getContentSummary(uri);
            stat_num_files += _cs.getFileCount();
            stat_total_size += _cs.getLength();
            // Use this file's own length (not the running total) when fetching its block locations.
            stat_num_blocks += _fs.getFileBlockLocations(_ax, 0, _ax.getLen()).length;
        }
    }

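    /**
     * Writes the table's DFS footprint (total size, number of files, number
     * of blocks) as gathered by getFileSystemStats().
     */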
    void flushInfo_DFS(FSDataOutputStream _out) throws Exception {
        assert (_out != null);

        _out.writeBytes("DFSInformation\n");
        Path tablePath = new Path(_tableDataLocation_);
        stat_num_files = 0;
        stat_total_size = 0;
        stat_num_blocks = 0;
        getFileSystemStats(tablePath);

        _out.writeBytes("stat_total_size" + ToolBox.hiveDelimiter + _finalTblName_ + ToolBox.hiveDelimiter
                + stat_total_size + "\n");
        _out.writeBytes("stat_num_files" + ToolBox.hiveDelimiter + _finalTblName_ + ToolBox.hiveDelimiter
                + stat_num_files + "\n");
        _out.writeBytes("stat_num_blocks" + ToolBox.hiveDelimiter + _finalTblName_ + ToolBox.hiveDelimiter
                + stat_num_blocks + "\n");
    }

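    /**
     * Rebuilds the column-name-to-type map from the comma-separated header
     * name and type lists stored in the job configuration.
     */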
    void parseTableStruct() {

        String _hName_ = jc.get(ToolBox.TABLE_HEADER_NAMES_ATTR);
        if (_hName_ == null) {
            return;
        }
        StringTokenizer _stName_ = new StringTokenizer(_hName_, ToolBox.commaDelimiter);
        String _hType_ = jc.get(ToolBox.TABLE_HEADER_TYPES_ATTR);
        if (_hType_ == null) {
            // Guard against a missing type header, mirroring the name-header check above.
            return;
        }
        StringTokenizer _stType_ = new StringTokenizer(_hType_, ToolBox.commaDelimiter);
        while (_stName_.hasMoreTokens() && _stType_.hasMoreTokens()) {
            tblStructHashMap.put(_stName_.nextToken(), _stType_.nextToken());
        }
    }

}
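
Example

reduce() is, at its core, a prefix-dispatched accumulator: each incoming key names a statistic, and values that share a key are summed into a TreeMap. The sketch below distills that pattern into a self-contained class that runs without a TDW or Hive build; the class name and prefix strings are illustrative placeholders, not the real *_ATTR constants from SampleOperator or StatsCollectionOperator.

import java.util.TreeMap;

/** Minimal, hypothetical sketch of the key-prefix accumulation used by reduce(). */
public class PrefixAccumulatorSketch {

    // Illustrative prefixes; the real code dispatches on constants such as
    // StatsCollectionOperator.FIELDLENGTH_ATTR.
    static final String FIELD_LENGTH = "FieldLength_";
    static final String RECORDS_NUM = "RecordsNum_";

    final TreeMap<String, Integer> infoDict = new TreeMap<String, Integer>();
    int statNumRecords = 0;

    /** Route one (key, value) pair by key prefix and fold it into the running totals. */
    void reduce(String key, String value) {
        if (key.startsWith(FIELD_LENGTH)) {
            Integer prev = infoDict.get(key);
            int v = Integer.parseInt(value);
            infoDict.put(key, prev == null ? v : prev + v);
        } else if (key.startsWith(RECORDS_NUM)) {
            statNumRecords += Integer.parseInt(value);
        }
    }

    public static void main(String[] args) {
        PrefixAccumulatorSketch r = new PrefixAccumulatorSketch();
        r.reduce("FieldLength_col1", "40");
        r.reduce("FieldLength_col1", "60");
        r.reduce("RecordsNum_", "10");
        // Derived statistic, as in flushTableStatsInfo(): average field width
        // = accumulated field length / record count = 100 / 10 = 10.0
        System.out.println((double) r.infoDict.get("FieldLength_col1") / r.statNumRecords);
    }
}

The TreeMap keeps per-column keys sorted, which is why the flush methods above can emit statistics in a stable, alphabetical order.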