com.kylinolap.dict.DictionaryManager.java Source code

Java tutorial

Introduction

Below is the complete source code of com.kylinolap.dict.DictionaryManager.java, the dictionary-manager class from the Kylin OLAP project (package com.kylinolap.dict).

Source

/*
 * Copyright 2013-2014 eBay Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.kylinolap.dict;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.compress.utils.IOUtils;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.kylinolap.common.KylinConfig;
import com.kylinolap.common.persistence.ResourceStore;
import com.kylinolap.common.util.HadoopUtil;
import com.kylinolap.dict.lookup.FileTable;
import com.kylinolap.dict.lookup.HiveTable;
import com.kylinolap.dict.lookup.ReadableTable;
import com.kylinolap.dict.lookup.TableSignature;
import com.kylinolap.metadata.MetadataManager;
import com.kylinolap.metadata.model.cube.CubeDesc;
import com.kylinolap.metadata.model.cube.TblColRef;

/**
 * Manages the lifecycle of dictionaries: building them from source tables,
 * merging, de-duplicating (by input signature and by content), caching, and
 * persisting them through the {@link ResourceStore}.
 * <p>
 * One manager instance exists per {@link KylinConfig}; obtain it via
 * {@link #getInstance(KylinConfig)}. Instances are thread-safe: both the
 * per-config service cache and the per-path dictionary cache use
 * {@link ConcurrentHashMap} with atomic put-if-absent publication.
 */
public class DictionaryManager {

    private static final Logger logger = LoggerFactory.getLogger(DictionaryManager.class);

    // Negative-cache marker: stored in dictCache for resource paths that were
    // looked up but had no dictionary, so repeated misses skip the store.
    private static final DictionaryInfo NONE_INDICATOR = new DictionaryInfo();

    // static cached instances, one manager per KylinConfig
    private static final ConcurrentHashMap<KylinConfig, DictionaryManager> SERVICE_CACHE = new ConcurrentHashMap<KylinConfig, DictionaryManager>();

    /**
     * Returns the manager bound to the given config, creating it on first use.
     * Uses putIfAbsent so that concurrent first callers all observe the same
     * instance (a plain get/put sequence could publish two managers).
     */
    public static DictionaryManager getInstance(KylinConfig config) {
        DictionaryManager r = SERVICE_CACHE.get(config);
        if (r == null) {
            r = new DictionaryManager(config);
            DictionaryManager racer = SERVICE_CACHE.putIfAbsent(config, r);
            if (racer != null)
                r = racer; // another thread won the race; use its instance
        }
        return r;
    }

    /** Drops the cached manager for the given config (e.g. on config change). */
    public static void removeInstance(KylinConfig config) {
        SERVICE_CACHE.remove(config);
    }

    // ============================================================================

    private KylinConfig config;
    // resource path ==> DictionaryInfo; NONE_INDICATOR marks a known miss
    private ConcurrentHashMap<String, DictionaryInfo> dictCache;

    private DictionaryManager(KylinConfig config) {
        this.config = config;
        dictCache = new ConcurrentHashMap<String, DictionaryInfo>();
    }

    /**
     * Returns the dictionary object stored at the given resource path, or null
     * if no dictionary exists there.
     */
    public Dictionary<?> getDictionary(String resourcePath) throws IOException {
        DictionaryInfo dictInfo = getDictionaryInfo(resourcePath);
        return dictInfo == null ? null : dictInfo.getDictionaryObject();
    }

    /**
     * Returns the {@link DictionaryInfo} (including the dictionary object) at
     * the given resource path, loading and caching it on first access. Returns
     * null when the resource does not exist; the miss itself is cached via
     * NONE_INDICATOR so repeated lookups do not hit the store again.
     */
    public DictionaryInfo getDictionaryInfo(String resourcePath) throws IOException {
        DictionaryInfo dictInfo = dictCache.get(resourcePath);
        if (dictInfo == null) {
            DictionaryInfo loaded = load(resourcePath, true);
            dictInfo = (loaded == null) ? NONE_INDICATOR : loaded;
            // putIfAbsent keeps the cache consistent if two threads miss at once
            DictionaryInfo racer = dictCache.putIfAbsent(resourcePath, dictInfo);
            if (racer != null)
                dictInfo = racer;
        }
        return dictInfo == NONE_INDICATOR ? null : dictInfo;
    }

    /**
     * Persists the given dictionary unless an existing dictionary with
     * identical content already exists, in which case the existing one is
     * returned and nothing is written.
     */
    public DictionaryInfo trySaveNewDict(Dictionary<?> newDict, DictionaryInfo newDictInfo) throws IOException {

        String dupDict = checkDupByContent(newDictInfo, newDict);
        if (dupDict != null) {
            logger.info("Identical dictionary content {}, reuse existing dictionary at {}", newDict, dupDict);
            return getDictionaryInfo(dupDict);
        }

        newDictInfo.setDictionaryObject(newDict);
        newDictInfo.setDictionaryClass(newDict.getClass().getName());

        save(newDictInfo);

        dictCache.put(newDictInfo.getResourcePath(), newDictInfo);

        return newDictInfo;
    }

    /**
     * Merges several dictionaries over the same column into one. All inputs
     * must be structurally equal (same source table/column/type); otherwise an
     * IllegalArgumentException is thrown. The merged dictionary is de-duplicated
     * against existing ones before being saved.
     *
     * @throws IllegalArgumentException if dicts is empty or structurally mixed
     */
    public DictionaryInfo mergeDictionary(List<DictionaryInfo> dicts) throws IOException {
        DictionaryInfo firstDictInfo = null;
        // NOTE(review): int may overflow for very large combined inputs;
        // bounded by TableSignature.setSize(int) taking an int.
        int totalSize = 0;
        for (DictionaryInfo info : dicts) {
            // check all inputs describe the same column
            if (firstDictInfo == null) {
                firstDictInfo = info;
            } else {
                if (!firstDictInfo.isDictOnSameColumn(info)) {
                    throw new IllegalArgumentException(
                            "Merging dictionaries are not structurally equal(regardless of signature).");
                }
            }
            totalSize += info.getInput().getSize();
        }

        if (firstDictInfo == null) {
            throw new IllegalArgumentException("DictionaryManager.mergeDictionary input cannot be null");
        }

        // synthesize a signature for the merged result; there is no single
        // original path, so a sentinel path is used
        DictionaryInfo newDictInfo = new DictionaryInfo(firstDictInfo);
        TableSignature signature = newDictInfo.getInput();
        signature.setSize(totalSize);
        signature.setLastModifiedTime(System.currentTimeMillis());
        signature.setPath("merged_with_no_original_path");

        String dupDict = checkDupByInfo(newDictInfo);
        if (dupDict != null) {
            logger.info("Identical dictionary input {}, reuse existing dictionary at {}", newDictInfo.getInput(),
                    dupDict);
            return getDictionaryInfo(dupDict);
        }

        Dictionary<?> newDict = DictionaryGenerator.mergeDictionaries(newDictInfo, dicts);

        return trySaveNewDict(newDict, newDictInfo);
    }

    /**
     * Builds (or reuses) a dictionary for the given cube column. The source
     * data is decided by {@link #decideSourceData(CubeDesc, TblColRef, String)};
     * an existing dictionary with the same input signature is reused instead of
     * rebuilding.
     */
    public DictionaryInfo buildDictionary(CubeDesc cube, TblColRef col, String factColumnsPath) throws IOException {

        Object[] tmp = decideSourceData(cube, col, factColumnsPath);
        String srcTable = (String) tmp[0];
        String srcCol = (String) tmp[1];
        int srcColIdx = (Integer) tmp[2];
        ReadableTable inpTable = (ReadableTable) tmp[3];

        DictionaryInfo dictInfo = new DictionaryInfo(srcTable, srcCol, srcColIdx, col.getDatatype(),
                inpTable.getSignature(), inpTable.getColumnDelimeter());

        String dupDict = checkDupByInfo(dictInfo);
        if (dupDict != null) {
            logger.info("Identical dictionary input {}, reuse existing dictionary at {}", dictInfo.getInput(),
                    dupDict);
            return getDictionaryInfo(dupDict);
        }

        Dictionary<?> dict = DictionaryGenerator.buildDictionary(dictInfo, inpTable);

        return trySaveNewDict(dict, dictInfo);
    }

    /**
     * Decides where a column's dictionary data comes from.
     *
     * @return a 4-tuple {@code { srcTable (String), srcCol (String),
     *         srcColIdx (Integer), table (ReadableTable) }}
     */
    public Object[] decideSourceData(CubeDesc cube, TblColRef col, String factColumnsPath) throws IOException {
        String srcTable;
        String srcCol;
        int srcColIdx;
        ReadableTable table;
        MetadataManager metaMgr = MetadataManager.getInstance(config);

        if (cube == null) { // case of full table, no cube to consult
            srcTable = col.getTable();
            srcCol = col.getName();
            srcColIdx = col.getColumn().getZeroBasedIndex();
            int nColumns = metaMgr.getTableDesc(col.getTable()).getColumnCount();
            table = new FileTable(factColumnsPath + "/" + col.getName(), nColumns);
            return new Object[] { srcTable, srcCol, srcColIdx, table };
        }

        // Decide source data of dictionary:
        // 1. If 'useDict' specifies pre-defined data set, use that
        // 2. Otherwise find a lookup table to scan through

        // Note normal column on fact table is not supported due to the size of
        // fact table
        // Note FK on fact table is supported by scan the related PK on lookup
        // table

        String useDict = cube.getRowkey().getDictionary(col);

        // normal case, source from lookup table
        if ("true".equals(useDict) || "string".equals(useDict) || "number".equals(useDict)
                || "any".equals(useDict)) {
            // FK on fact table, use PK from lookup instead
            if (cube.isFactTable(col.getTable())) {
                TblColRef pkCol = cube.findPKByFK(col);
                if (pkCol != null)
                    col = pkCol; // scan the counterparty PK on lookup table
                                 // instead
            }
            srcTable = col.getTable();
            srcCol = col.getName();
            srcColIdx = col.getColumn().getZeroBasedIndex();
            if (cube.isFactTable(col.getTable())) {
                table = new FileTable(factColumnsPath + "/" + col.getName(), -1);
            } else {
                table = new HiveTable(metaMgr, col.getTable());
            }
        }
        // otherwise could refer to a data set, e.g. common_indicators.txt
        // (LEGACY PATH, since distinct values are collected from fact table)
        else {
            String dictDataSetPath = unpackDataSet(this.config.getTempHDFSDir(), useDict);
            if (dictDataSetPath == null)
                throw new IllegalArgumentException(
                        "Unknown dictionary data set '" + useDict + "', referred from " + col);
            srcTable = "PREDEFINED";
            srcCol = useDict;
            srcColIdx = 0;
            table = new FileTable(dictDataSetPath, -1);
        }

        return new Object[] { srcTable, srcCol, srcColIdx, table };
    }

    /**
     * Copies a bundled classpath data set (/com/kylinolap/dict/&lt;name&gt;.txt)
     * to a temp location on HDFS so MR jobs can read it, skipping the copy when
     * a file of the same length is already there.
     *
     * @return the fully qualified HDFS path, or null if the data set resource
     *         does not exist on the classpath
     */
    private String unpackDataSet(String tempHDFSDir, String dataSetName) throws IOException {

        InputStream in = this.getClass().getResourceAsStream("/com/kylinolap/dict/" + dataSetName + ".txt");
        if (in == null) // data set resource not found
            return null;

        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        try {
            IOUtils.copy(in, buf);
        } finally {
            in.close(); // close even if the copy fails
        }
        byte[] bytes = buf.toByteArray();

        // length is embedded in the file name so a stale file of a different
        // size is detected and overwritten below
        Path tmpDataSetPath = new Path(
                tempHDFSDir + "/dict/temp_dataset/" + dataSetName + "_" + bytes.length + ".txt");

        FileSystem fs = HadoopUtil.getFileSystem(tempHDFSDir);
        boolean writtenNewFile = false;
        if (!fs.exists(tmpDataSetPath) || fs.getFileStatus(tmpDataSetPath).getLen() != bytes.length) {
            fs.mkdirs(tmpDataSetPath.getParent());
            FSDataOutputStream out = fs.create(tmpDataSetPath);
            try {
                IOUtils.copy(new ByteArrayInputStream(bytes), out);
            } finally {
                out.close(); // avoid leaking the HDFS stream on copy failure
            }
            writtenNewFile = true;
        }

        String qualifiedPath = tmpDataSetPath.makeQualified(fs.getUri(), new Path("/")).toString();
        if (writtenNewFile)
            logger.info("Dictionary temp data set file written to {}", qualifiedPath);
        return qualifiedPath;
    }

    /**
     * Looks for an existing dictionary whose input table signature equals the
     * given one (cheap de-dup that avoids comparing dictionary content).
     *
     * @return the resource path of the duplicate, or null if none
     */
    private String checkDupByInfo(DictionaryInfo dictInfo) throws IOException {
        ResourceStore store = MetadataManager.getInstance(config).getStore();
        ArrayList<String> existings = store.listResources(dictInfo.getResourceDir());
        if (existings == null)
            return null;

        TableSignature input = dictInfo.getInput();
        for (String existing : existings) {
            // skip cache, direct load from store; info-only (no dict object)
            // since only the signature is compared
            DictionaryInfo existingInfo = load(existing, false);
            // guard against a resource removed between list and load
            if (existingInfo != null && input.equals(existingInfo.getInput()))
                return existing;
        }

        return null;
    }

    /**
     * Looks for an existing dictionary whose content equals the given
     * dictionary (full de-dup, used right before saving a new dictionary).
     *
     * @return the resource path of the duplicate, or null if none
     */
    private String checkDupByContent(DictionaryInfo dictInfo, Dictionary<?> dict) throws IOException {
        ResourceStore store = MetadataManager.getInstance(config).getStore();
        ArrayList<String> existings = store.listResources(dictInfo.getResourceDir());
        if (existings == null)
            return null;

        for (String existing : existings) {
            // full load including the dictionary object, needed to compare content
            DictionaryInfo existingInfo = load(existing, true);
            if (existingInfo != null && dict.equals(existingInfo.getDictionaryObject()))
                return existing;
        }

        return null;
    }

    /** Deletes a dictionary from the store and evicts it from the cache. */
    public void removeDictionary(String resourcePath) throws IOException {
        ResourceStore store = MetadataManager.getInstance(config).getStore();
        store.deleteResource(resourcePath);
        dictCache.remove(resourcePath);
    }

    /** Deletes all dictionaries built for the given source table/column. */
    public void removeDictionaries(String srcTable, String srcCol) throws IOException {
        // a transient info object only used to compute the resource directory
        DictionaryInfo info = new DictionaryInfo();
        info.setSourceTable(srcTable);
        info.setSourceColumn(srcCol);

        ResourceStore store = MetadataManager.getInstance(config).getStore();
        ArrayList<String> existings = store.listResources(info.getResourceDir());
        if (existings == null)
            return;

        for (String existing : existings)
            removeDictionary(existing);
    }

    /** Writes the dictionary (info + object) to the resource store. */
    void save(DictionaryInfo dict) throws IOException {
        ResourceStore store = MetadataManager.getInstance(config).getStore();
        String path = dict.getResourcePath();
        logger.info("Saving dictionary at {}", path);
        store.putResource(path, dict, DictionaryInfoSerializer.FULL_SERIALIZER);
    }

    /**
     * Reads a dictionary from the resource store.
     *
     * @param loadDictObj true to deserialize the dictionary object as well,
     *                    false to read only the metadata (cheaper)
     * @return the loaded info, or null if the resource does not exist
     */
    DictionaryInfo load(String resourcePath, boolean loadDictObj) throws IOException {
        ResourceStore store = MetadataManager.getInstance(config).getStore();

        DictionaryInfo info = store.getResource(resourcePath, DictionaryInfo.class,
                loadDictObj ? DictionaryInfoSerializer.FULL_SERIALIZER : DictionaryInfoSerializer.INFO_SERIALIZER);

        if (loadDictObj)
            logger.debug("Loaded dictionary at {}", resourcePath);

        return info;
    }

}