org.apache.kylin.engine.mr.steps.CalculateStatsFromBaseCuboidMapper.java Source code

Introduction

Here is the source code for org.apache.kylin.engine.mr.steps.CalculateStatsFromBaseCuboidMapper.java. This mapper reads rows of the base cuboid, samples a configurable percentage of them, and estimates the distinct row count of every target cuboid with HyperLogLog counters; the per-cuboid counters are written out, one record per cuboid, in doCleanup().

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.kylin.engine.mr.steps;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.KylinVersion;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.common.util.HadoopUtil;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.cuboid.CuboidUtil;
import org.apache.kylin.cube.kv.RowKeyDecoder;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.engine.mr.KylinMapper;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.hllc.HLLCounter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

public class CalculateStatsFromBaseCuboidMapper extends KylinMapper<Text, Text, Text, Text> {
    private static final Logger logger = LoggerFactory.getLogger(CalculateStatsFromBaseCuboidMapper.class);

    protected int nRowKey;
    protected long baseCuboidId;

    private int samplingPercentage;
    private int rowCount = 0;
    private long[] rowHashCodesLong = null;
    // for details of the new algorithm, see KYLIN-2518
    private boolean isUsePutRowKeyToHllNewAlgorithm;

    private HLLCounter[] allCuboidsHLL = null;
    private Long[] cuboidIds;
    private Integer[][] allCuboidsBitSet = null;
    private HashFunction hf = null;

    RowKeyDecoder rowKeyDecoder;

    protected Text outputKey = new Text();
    protected Text outputValue = new Text();

    @Override
    protected void doSetup(Context context) throws IOException {
        Configuration conf = context.getConfiguration();
        HadoopUtil.setCurrentConfiguration(conf);
        KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();

        String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
        CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        CubeSegment cubeSegment = cube.getSegmentById(conf.get(BatchConstants.CFG_CUBE_SEGMENT_ID));

        baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId();
        nRowKey = cubeDesc.getRowkey().getRowKeyColumns().length;

        String cuboidModeName = conf.get(BatchConstants.CFG_CUBOID_MODE);
        Set<Long> cuboidIdSet = cube.getCuboidsByMode(cuboidModeName);

        cuboidIds = cuboidIdSet.toArray(new Long[cuboidIdSet.size()]);
        allCuboidsBitSet = CuboidUtil.getCuboidBitSet(cuboidIds, nRowKey);
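        // allCuboidsBitSet[i] holds the ordinals of the row key columns that make up cuboid i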

        samplingPercentage = Integer
                .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT));

        allCuboidsHLL = new HLLCounter[cuboidIds.length];
        for (int i = 0; i < cuboidIds.length; i++) {
            allCuboidsHLL[i] = new HLLCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision());
        }

        //for KYLIN-2518 backward compatibility
        if (KylinVersion.isBefore200(cubeDesc.getVersion())) {
            isUsePutRowKeyToHllNewAlgorithm = false;
            hf = Hashing.murmur3_32();
            logger.info("Found KylinVersion : {}. Use old algorithm for cuboid sampling.", cubeDesc.getVersion());
        } else {
            isUsePutRowKeyToHllNewAlgorithm = true;
            rowHashCodesLong = new long[nRowKey];
            hf = Hashing.murmur3_128();
            logger.info(
                    "Found KylinVersion : {}. Use new algorithm for cuboid sampling. About the details of the new algorithm, please refer to KYLIN-2518",
                    cubeDesc.getVersion());
        }

        rowKeyDecoder = new RowKeyDecoder(cubeSegment);
    }

    @Override
    public void doMap(Text key, Text value, Context context) throws InterruptedException, IOException {
        long cuboidID = rowKeyDecoder.decode(key.getBytes());
        if (cuboidID != baseCuboidId) {
            return; // Skip data from cuboids which are not the base cuboid
        }

        List<String> keyValues = rowKeyDecoder.getValues();

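        // rowCount cycles from 0 to 99 (reset below), so samplingPercentage rows out of every 100 are sampled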
        if (rowCount < samplingPercentage) {
            Preconditions.checkArgument(nRowKey == keyValues.size());

            String[] row = keyValues.toArray(new String[keyValues.size()]);
            if (isUsePutRowKeyToHllNewAlgorithm) {
                putRowKeyToHLLNew(row);
            } else {
                putRowKeyToHLLOld(row);
            }
        }

        if (++rowCount == 100)
            rowCount = 0;
    }

    public void putRowKeyToHLLOld(String[] row) {
        //generate hash for each row key column
        byte[][] rowHashCodes = new byte[nRowKey][];
        for (int i = 0; i < nRowKey; i++) {
            Hasher hc = hf.newHasher();
            String colValue = row[i];
            if (colValue != null) {
                rowHashCodes[i] = hc.putString(colValue).hash().asBytes();
            } else {
                rowHashCodes[i] = hc.putInt(0).hash().asBytes();
            }
        }

        // use the row key column hash to get a consolidated hash for each cuboid
        for (int i = 0; i < cuboidIds.length; i++) {
            Hasher hc = hf.newHasher();
            for (int position = 0; position < allCuboidsBitSet[i].length; position++) {
                hc.putBytes(rowHashCodes[allCuboidsBitSet[i][position]]);
            }

            allCuboidsHLL[i].add(hc.hash().asBytes());
        }
    }

    private void putRowKeyToHLLNew(String[] row) {
        //generate hash for each row key column
        for (int i = 0; i < nRowKey; i++) {
            Hasher hc = hf.newHasher();
            String colValue = row[i];
            if (colValue == null)
                colValue = "0";
            byte[] bytes = hc.putString(colValue).hash().asBytes();
            rowHashCodesLong[i] = (Bytes.toLong(bytes) + i); // add column ordinal to the hash value to distinguish between (a,b) and (b,a)
        }

        // use the row key column hash to get a consolidated hash for each cuboid
        for (int i = 0, n = allCuboidsBitSet.length; i < n; i++) {
            long value = 0;
            for (int position = 0; position < allCuboidsBitSet[i].length; position++) {
                value += rowHashCodesLong[allCuboidsBitSet[i][position]];
            }
            allCuboidsHLL[i].addHashDirectly(value);
        }
    }

    @Override
    protected void doCleanup(Context context) throws IOException, InterruptedException {
        ByteBuffer hllBuf = ByteBuffer.allocate(BufferedMeasureCodec.DEFAULT_BUFFER_SIZE);
        HLLCounter hll;
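        // emit one record per cuboid: key = cuboid id, value = the serialized HLL registers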
        for (int i = 0; i < cuboidIds.length; i++) {
            hll = allCuboidsHLL[i];

            outputKey.set(Bytes.toBytes(cuboidIds[i]));
            hllBuf.clear();
            hll.writeRegisters(hllBuf);
            outputValue.set(hllBuf.array(), 0, hllBuf.position());
            context.write(outputKey, outputValue);
        }
    }
}
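
The core of the KYLIN-2518 sampling scheme above is that each row key column is hashed once per row, and the per-column hashes are combined additively for every cuboid, so the per-row cost grows with the number of columns plus the number of cuboids rather than with re-hashing every cuboid's columns from scratch. The following is a minimal, self-contained sketch of that idea, not Kylin code: it uses a plain HashSet of combined hashes in place of HLLCounter, an illustrative FNV-style 64-bit hash in place of Guava's murmur3_128, and made-up cuboid column sets and rows.

import java.util.HashSet;
import java.util.Set;

public class CuboidSamplingSketch {

    // Illustrative FNV-1a style 64-bit hash; the real mapper uses Guava's murmur3_128.
    static long hash64(String s) {
        long h = 0xcbf29ce484222325L;
        for (int i = 0; i < s.length(); i++) {
            h ^= s.charAt(i);
            h *= 0x100000001b3L;
        }
        return h;
    }

    public static void main(String[] args) {
        // Hypothetical cuboids over 3 row key columns, expressed as column ordinals,
        // mirroring allCuboidsBitSet in the mapper.
        int[][] cuboidColumns = { { 0, 1, 2 }, { 0, 1 }, { 1, 2 }, { 2 } };

        // One HashSet per cuboid stands in for the HLLCounter array.
        @SuppressWarnings("unchecked")
        Set<Long>[] distinctPerCuboid = new Set[cuboidColumns.length];
        for (int i = 0; i < cuboidColumns.length; i++) {
            distinctPerCuboid[i] = new HashSet<>();
        }

        // Made-up sample rows (date, country, device).
        String[][] rows = {
                { "2023-01-01", "US", "mobile" },
                { "2023-01-01", "US", "desktop" },
                { "2023-01-02", "CN", "mobile" },
        };

        for (String[] row : rows) {
            // Hash every row key column once; adding the column ordinal distinguishes
            // (a, b) from (b, a) when the hashes are later summed.
            long[] colHashes = new long[row.length];
            for (int i = 0; i < row.length; i++) {
                String v = (row[i] == null) ? "0" : row[i];
                colHashes[i] = hash64(v) + i;
            }

            // Combine the per-column hashes additively for each cuboid.
            for (int c = 0; c < cuboidColumns.length; c++) {
                long combined = 0;
                for (int col : cuboidColumns[c]) {
                    combined += colHashes[col];
                }
                distinctPerCuboid[c].add(combined);
            }
        }

        for (int c = 0; c < cuboidColumns.length; c++) {
            System.out.println("cuboid " + c + " distinct combined hashes = " + distinctPerCuboid[c].size());
        }
    }
}

Because the column ordinal is added to each column hash, cuboids that contain the same values in different columns still produce different combined values, which is the same trick the mapper applies in putRowKeyToHLLNew.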