org.apache.kylin.engine.mr.steps.MergeCuboidMapper.java Source code

Introduction

Here is the source code for org.apache.kylin.engine.mr.steps.MergeCuboidMapper.java. This class is the mapper of Kylin's MapReduce cube merge job: it reads pre-aggregated cuboid rows from an existing source segment and rewrites their row keys and dictionary-encoded measures against the dictionaries of the new, merged segment.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.kylin.engine.mr.steps;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.ByteArray;
import org.apache.kylin.common.util.BytesUtil;
import org.apache.kylin.common.util.Dictionary;
import org.apache.kylin.common.util.Pair;
import org.apache.kylin.common.util.SplittedBytes;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.common.RowKeySplitter;
import org.apache.kylin.cube.cuboid.Cuboid;
import org.apache.kylin.cube.kv.RowConstants;
import org.apache.kylin.cube.kv.RowKeyEncoder;
import org.apache.kylin.cube.kv.RowKeyEncoderProvider;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.dict.DictionaryManager;
import org.apache.kylin.engine.mr.IMROutput2;
import org.apache.kylin.engine.mr.KylinMapper;
import org.apache.kylin.engine.mr.MRUtil;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.MeasureIngester;
import org.apache.kylin.measure.MeasureType;
import org.apache.kylin.metadata.model.MeasureDesc;
import org.apache.kylin.metadata.model.TableRef;
import org.apache.kylin.metadata.model.TblColRef;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * Mapper of the cube merge job: reads pre-aggregated cuboid rows from one
 * source segment and re-encodes them against the merged segment's
 * dictionaries and row key layout.
 *
 * @author ysong1, honma
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public class MergeCuboidMapper extends KylinMapper<Text, Text, Text, Text> {

    private KylinConfig config;
    private String cubeName;
    private String segmentID;
    private CubeManager cubeManager;
    private CubeInstance cube;
    private CubeDesc cubeDesc;
    private CubeSegment mergedCubeSegment;
    private CubeSegment sourceCubeSegment; // Must be unique during a mapper's life cycle

    private Text outputKey = new Text();

    private byte[] newKeyBodyBuf;
    private ByteArray newKeyBuf;
    private RowKeySplitter rowKeySplitter;
    private RowKeyEncoderProvider rowKeyEncoderProvider;

    private HashMap<TblColRef, Boolean> dimensionsNeedDict = new HashMap<TblColRef, Boolean>();

    // for re-encoding measures that use a dictionary
    private List<Pair<Integer, MeasureIngester>> dictMeasures;
    private Map<TblColRef, Dictionary<String>> oldDicts;
    private Map<TblColRef, Dictionary<String>> newDicts;
    private List<MeasureDesc> measureDescs;
    private BufferedMeasureCodec codec;
    private Object[] measureObjs;
    private Text outputValue;

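    /**
     * Loads cube metadata, figures out which source segment this mapper's
     * input split was written by, and records the measures whose dictionaries
     * differ between the source segment and the merged segment.
     */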
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.bindCurrentConfiguration(context.getConfiguration());

        cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME).toUpperCase();
        segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);

        config = AbstractHadoopJob.loadKylinPropsAndMetadata();

        cubeManager = CubeManager.getInstance(config);
        cube = cubeManager.getCube(cubeName);
        cubeDesc = cube.getDescriptor();
        mergedCubeSegment = cube.getSegmentById(segmentID);

        // int colCount = cubeDesc.getRowkey().getRowKeyColumns().length;
        newKeyBodyBuf = new byte[RowConstants.ROWKEY_BUFFER_SIZE];// size will auto-grow
        newKeyBuf = ByteArray.allocate(RowConstants.ROWKEY_BUFFER_SIZE);

        // decide which source segment this mapper's input split belongs to
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        IMROutput2.IMRMergeOutputFormat outputFormat = MRUtil.getBatchMergeOutputSide2(mergedCubeSegment)
                .getOuputFormat();
        sourceCubeSegment = outputFormat.findSourceSegment(fileSplit, cube);

        rowKeySplitter = new RowKeySplitter(sourceCubeSegment, 65, 255);
        rowKeyEncoderProvider = new RowKeyEncoderProvider(mergedCubeSegment);

        measureDescs = cubeDesc.getMeasures();
        codec = new BufferedMeasureCodec(measureDescs);
        measureObjs = new Object[measureDescs.size()];
        outputValue = new Text();

        dictMeasures = Lists.newArrayList();
        oldDicts = Maps.newHashMap();
        newDicts = Maps.newHashMap();
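        // Collect measures whose dictionaries differ between the source and
        // merged segments; their values must be re-encoded row by row in doMap().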
        for (int i = 0; i < measureDescs.size(); i++) {
            MeasureDesc measureDesc = measureDescs.get(i);
            MeasureType measureType = measureDesc.getFunction().getMeasureType();
            List<TblColRef> columns = measureType.getColumnsNeedDictionary(measureDesc.getFunction());
            boolean needReEncode = false;
            for (TblColRef col : columns) {
                // a column whose records are all null has no source dictionary; skip it
                if (sourceCubeSegment.getDictionary(col) == null) {
                    continue;
                }

                if (!sourceCubeSegment.getDictionary(col).equals(mergedCubeSegment.getDictionary(col))) {
                    oldDicts.put(col, sourceCubeSegment.getDictionary(col));
                    newDicts.put(col, mergedCubeSegment.getDictionary(col));
                    needReEncode = true;
                }
            }
            if (needReEncode) {
                dictMeasures.add(Pair.newPair(i, measureType.newIngester()));
            }
        }
    }

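    /**
     * Re-encodes one cuboid row: dictionary ids in the row key are translated
     * from the source segment's dictionaries to the merged segment's, and
     * dictionary-backed measures are re-encoded the same way.
     */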
    @Override
    public void doMap(Text key, Text value, Context context) throws IOException, InterruptedException {
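        // Split the incoming row key: the header identifies the cuboid, and
        // the body is cut into one buffer per row key column.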
        long cuboidID = rowKeySplitter.split(key.getBytes());
        Cuboid cuboid = Cuboid.findById(cubeDesc, cuboidID);
        RowKeyEncoder rowkeyEncoder = rowKeyEncoderProvider.getRowkeyEncoder(cuboid);

        SplittedBytes[] splittedByteses = rowKeySplitter.getSplitBuffers();
        int bufOffset = 0;
        int bodySplitOffset = rowKeySplitter.getBodySplitOffset();

        for (int i = 0; i < cuboid.getColumns().size(); ++i) {
            int useSplit = i + bodySplitOffset;
            TblColRef col = cuboid.getColumns().get(i);

            if (this.checkNeedMerging(col)) {
                // a dictionary-encoded fact table column needs its ids rewritten
                DictionaryManager dictMgr = DictionaryManager.getInstance(config);
                Dictionary<String> mergedDict = dictMgr.getDictionary(mergedCubeSegment.getDictResPath(col));

                Dictionary<String> sourceDict;
                // handle a column whose records are all null: write the merged dictionary's null id
                if (sourceCubeSegment.getDictionary(col) == null) {
                    BytesUtil.writeUnsigned(mergedDict.nullId(), newKeyBodyBuf, bufOffset,
                            mergedDict.getSizeOfId());
                    bufOffset += mergedDict.getSizeOfId();
                    continue;
                } else {
                    sourceDict = dictMgr.getDictionary(sourceCubeSegment.getDictResPath(col));
                }

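                // grow newKeyBodyBuf by doubling until it can hold the largest
                // value or id of either dictionary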
                while (sourceDict.getSizeOfValue() > newKeyBodyBuf.length - bufOffset || //
                        mergedDict.getSizeOfValue() > newKeyBodyBuf.length - bufOffset || //
                        mergedDict.getSizeOfId() > newKeyBodyBuf.length - bufOffset) {
                    byte[] oldBuf = newKeyBodyBuf;
                    newKeyBodyBuf = new byte[2 * newKeyBodyBuf.length];
                    System.arraycopy(oldBuf, 0, newKeyBodyBuf, 0, oldBuf.length);
                }

                int idInSourceDict = BytesUtil.readUnsigned(splittedByteses[useSplit].value, 0,
                        splittedByteses[useSplit].length);
                int idInMergedDict;

                //int size = sourceDict.getValueBytesFromId(idInSourceDict, newKeyBodyBuf, bufOffset);
                String v = sourceDict.getValueFromId(idInSourceDict);
                if (v == null) {
                    idInMergedDict = mergedDict.nullId();
                } else {
                    idInMergedDict = mergedDict.getIdFromValue(v);
                }

                BytesUtil.writeUnsigned(idInMergedDict, newKeyBodyBuf, bufOffset, mergedDict.getSizeOfId());
                bufOffset += mergedDict.getSizeOfId();
            } else {
                // no re-encoding needed: grow the buffer if necessary and copy the bytes as-is
                while (splittedByteses[useSplit].length > newKeyBodyBuf.length - bufOffset) {
                    byte[] oldBuf = newKeyBodyBuf;
                    newKeyBodyBuf = new byte[2 * newKeyBodyBuf.length];
                    System.arraycopy(oldBuf, 0, newKeyBodyBuf, 0, oldBuf.length);
                }

                System.arraycopy(splittedByteses[useSplit].value, 0, newKeyBodyBuf, bufOffset,
                        splittedByteses[useSplit].length);
                bufOffset += splittedByteses[useSplit].length;
            }
        }

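        // Encode the full row key (header plus re-encoded body) for the merged
        // segment, growing the key buffer as needed.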
        int fullKeySize = rowkeyEncoder.getBytesLength();
        while (newKeyBuf.array().length < fullKeySize) {
            newKeyBuf.set(new byte[newKeyBuf.length() * 2]);
        }
        newKeyBuf.set(0, fullKeySize);

        rowkeyEncoder.encode(new ByteArray(newKeyBodyBuf, 0, bufOffset), newKeyBuf);
        outputKey.set(newKeyBuf.array(), 0, fullKeySize);

        // re-encode measures if dictionary is used
        if (dictMeasures.size() > 0) {
            codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), measureObjs);
            for (Pair<Integer, MeasureIngester> pair : dictMeasures) {
                int i = pair.getFirst();
                MeasureIngester ingester = pair.getSecond();
                measureObjs[i] = ingester.reEncodeDictionary(measureObjs[i], measureDescs.get(i), oldDicts,
                        newDicts);
            }
            ByteBuffer valueBuf = codec.encode(measureObjs);
            outputValue.set(valueBuf.array(), 0, valueBuf.position());
            value = outputValue;
        }

        context.write(outputKey, value);
    }

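    /**
     * A column needs id translation when it is dictionary-encoded and its
     * dictionary is built from the fact table (such dictionaries are built
     * per segment); results are cached in dimensionsNeedDict.
     */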
    private Boolean checkNeedMerging(TblColRef col) throws IOException {
        Boolean ret = dimensionsNeedDict.get(col);
        if (ret != null)
            return ret;

        ret = cubeDesc.getRowkey().isUseDictionary(col);
        if (ret) {
            TableRef srcTable = DictionaryManager.getInstance(config).decideSourceData(cubeDesc.getModel(), col)
                    .getTableRef();
            ret = cubeDesc.getModel().isFactTable(srcTable);
        }

        dimensionsNeedDict.put(col, ret);
        return ret;
    }
}
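
Example

The core of doMap() is dictionary translation: read an id against the source
segment's dictionary, look up its value, and re-encode the value against the
merged segment's dictionary, falling back to the null id when the value is
absent. Here is a minimal, self-contained sketch of that idea. TinyDict and
DictTranslationSketch are hypothetical stand-ins written for illustration;
they are not part of Kylin, whose real dictionary type is
org.apache.kylin.common.util.Dictionary.

import java.util.HashMap;
import java.util.Map;

public class DictTranslationSketch {

    // Hypothetical toy dictionary: a bidirectional value/id mapping with a
    // reserved null id, standing in for Kylin's Dictionary<String>.
    static class TinyDict {
        final Map<String, Integer> valueToId = new HashMap<>();
        final Map<Integer, String> idToValue = new HashMap<>();
        final int nullId;

        TinyDict(String... values) {
            for (int i = 0; i < values.length; i++) {
                valueToId.put(values[i], i);
                idToValue.put(i, values[i]);
            }
            nullId = values.length; // reserve the next id for null
        }

        String getValueFromId(int id) {
            return idToValue.get(id);
        }

        int getIdFromValue(String v) {
            // assumes the merged dictionary contains every source value,
            // which holds in Kylin because the merged dictionary is built
            // from the dictionaries of the segments being merged
            return valueToId.get(v);
        }
    }

    // Translate an id from the source dictionary to the merged dictionary,
    // mapping missing values to the merged dictionary's null id; this is the
    // pattern doMap() applies to each dictionary-encoded row key column.
    static int translate(int idInSource, TinyDict sourceDict, TinyDict mergedDict) {
        String v = sourceDict.getValueFromId(idInSource);
        return (v == null) ? mergedDict.nullId : mergedDict.getIdFromValue(v);
    }

    public static void main(String[] args) {
        TinyDict source = new TinyDict("apple", "pear");           // ids: 0, 1
        TinyDict merged = new TinyDict("apple", "banana", "pear"); // ids: 0, 1, 2
        System.out.println(translate(1, source, merged)); // "pear": id 1 -> id 2
        System.out.println(translate(7, source, merged)); // unknown id -> null id 3
    }
}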