com.linkedin.cubert.plan.physical.GenerateDictionary.java Source code


Introduction

Here is the source code for com.linkedin.cubert.plan.physical.GenerateDictionary.java from the LinkedIn Cubert project. The class generates and refreshes per-column dictionaries for columns that are marked as dictionary encoded.

Source

/* (c) 2014 LinkedIn Corp. All rights reserved.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.plan.physical;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.mapred.FsInput;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.linkedin.cubert.utils.CodeDictionary;

/**
 * Executes a Map-Reduce job to generate and refresh dictionaries for the various columns
 * in a relation.
 * 
 * Dictionaries will be created for columns that have {@code isDictionaryEncoded} set to
 * {@code true}.
 * 
 * @author Maneesh Varshney
 * 
 */
public class GenerateDictionary {
    // public static class CreateDictionaryMapper<K, V> extends Mapper<K, V, Text, Text>
    // {
    // private String[] columnNames;
    // private boolean[] isDictionaryField;
    // private List<Set<String>> emittedKeys = new ArrayList<Set<String>>();
    //
    // private TupleCreator tupleCreator;
    // private final Text reuseKey = new Text();
    // private final Text reuseValue = new Text();
    //
    // private String replaceNull = "";
    // private String defaultValue = null;
    // private boolean defaultEmmited = false;
    //
    // @Override
    // protected void setup(Context context) throws IOException,
    // InterruptedException
    // {
    // Configuration conf = context.getConfiguration();
    //
    // FileCache.initialize(conf);
    // // FileCache cache = FileCache.get();
    //
    // ObjectMapper mapper = new ObjectMapper();
    // ArrayNode mapCommands =
    // mapper.readValue(conf.get(CubertStrings.JSON_MAP_OPERATOR_LIST),
    // ArrayNode.class);
    // JsonNode inputJson = mapCommands.get(0).get("input");
    // JsonNode outputJson =
    // mapper.readValue(conf.get(CubertStrings.JSON_OUTPUT), JsonNode.class);
    //
    // if (inputJson.has("replaceNull"))
    // {
    // replaceNull = JsonUtils.getText(inputJson, "replaceNull");
    // }
    //
    // if (inputJson.has("defaultValue"))
    // {
    // defaultValue = JsonUtils.getText(inputJson, "defaultValue");
    // }
    //
    // tupleCreator = new AvroTupleCreator();
    // tupleCreator.setup(inputJson);
    //
    // // Load the previous dictionary, if available
    // Map<String, CodeDictionary> dictionaries = null;
    //
    // String previousDict = conf.get(CubertStrings.DICTIONARY_RELATION);
    // System.out.println(">>> previous dict " + previousDict);
    // if (previousDict != null)
    // {
    // // String filename = cache.getCachedFile(previousDict);
    // // dictionaries = loadDictionary(filename, false, null);
    // }
    //
    // String outputColumns = JsonUtils.getText(outputJson, "columns");
    // BlockSchema inputSchema = new BlockSchema(inputJson.get("schema"));
    // BlockSchema outputSchema = new BlockSchema(outputColumns);
    // Set<String> dictionaryColumnSet = outputSchema.asMap().keySet();
    //
    // int numInputColumns = inputSchema.getNumColumns();
    // columnNames = inputSchema.getColumnNames();
    //
    // isDictionaryField = new boolean[numInputColumns];
    // for (int i = 0; i < numInputColumns; i++)
    // {
    // String colName = inputSchema.getName(i);
    // isDictionaryField[i] = dictionaryColumnSet.contains(colName);
    //
    // if (isDictionaryField[i])
    // {
    // Set<String> emitted = new HashSet<String>();
    // emittedKeys.add(emitted);
    //
    // if (dictionaries != null && dictionaries.containsKey(colName))
    // {
    // // Add the existing keys to emittedKeys, so that we don't emit
    // // them again
    // CodeDictionary dict = dictionaries.get(colName);
    // emittedKeys.get(i).addAll(dict.keySet());
    // }
    // }
    // else
    // {
    // emittedKeys.add(null);
    // }
    // }
    // }
    //
    // @Override
    // protected void map(K key, V value, Context context) throws IOException,
    // InterruptedException
    // {
    // map(tupleCreator.create(key, value), context);
    // }
    //
    // void map(Tuple tuple, Context context) throws IOException,
    // InterruptedException
    // {
    // for (int i = 0; i < isDictionaryField.length; i++)
    // {
    //
    // if (!isDictionaryField[i])
    // continue;
    //
    // Object val = tuple.get(i);
    // String colValue;
    //
    // if (val == null)
    // colValue = replaceNull;
    // else
    // colValue = val.toString();
    //
    // if (emittedKeys.get(i).contains(colValue))
    // {
    // continue;
    // }
    //
    // String colName = columnNames[i];
    // emitKeyValue(colName, colValue, context, i);
    //
    // if (defaultValue != null && !defaultEmmited)
    // emitKeyValue(colName, defaultValue, context, i);
    // }
    // defaultEmmited = true;
    // }
    //
    // void emitKeyValue(String colName,
    // String colValue,
    // Context context,
    // int columnIndex) throws IOException,
    // InterruptedException
    // {
    // reuseKey.set(colName);
    // reuseValue.set(colValue);
    // context.write(reuseKey, reuseValue);
    // emittedKeys.get(columnIndex).add(colValue);
    // }
    // }

    // public static class CreateDictionaryReducer extends
    // Reducer<Text, Text, AvroKey<Record>, NullWritable>
    // {
    // Record record;
    //
    // @Override
    // protected void setup(Context context) throws IOException,
    // InterruptedException
    // {
    // Configuration conf = context.getConfiguration();
    // Schema keySchema = AvroJob.getOutputKeySchema(conf);
    //
    // record = new Record(keySchema);
    // }
    //
    // @Override
    // protected void reduce(Text key, Iterable<Text> values, Context context) throws
    // IOException,
    // InterruptedException
    // {
    // record.put("colname", key.toString());
    // record.put("code", -1);
    //
    // for (Text value : values)
    // {
    // record.put("colvalue", value.toString());
    // context.write(new AvroKey<Record>(record), NullWritable.get());
    // }
    // }
    //
    // }

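    /**
     * Loads column dictionaries from an Avro dictionary file written with the schema
     * returned by {@link #getSchema()}. Each record contributes one
     * (colname, colvalue, code) entry to the CodeDictionary of its column.
     *
     * @param path path to the dictionary Avro file
     * @param isHDFS true if the path refers to HDFS, false for the local file system
     * @param conf Hadoop configuration, used only when reading from HDFS
     * @return a map from column name to its CodeDictionary
     * @throws IOException if the dictionary file cannot be read
     */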
    public static Map<String, CodeDictionary> loadDictionary(String path, boolean isHDFS, Configuration conf)
            throws IOException {
        Map<String, CodeDictionary> dictionaries = new HashMap<String, CodeDictionary>();
        Schema schema = getSchema();

        DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
        DataFileReader<GenericRecord> dataFileReader;

        if (isHDFS) {
            dataFileReader = new DataFileReader<GenericRecord>(new FsInput(new Path(path), conf), datumReader);
        } else {
            dataFileReader = new DataFileReader<GenericRecord>(new File(path), datumReader);
        }
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next();
            String colName = record.get("colname").toString();
            String colValue = record.get("colvalue").toString();
            int code = (Integer) record.get("code");

            CodeDictionary dict = dictionaries.get(colName);
            if (dict == null) {
                dict = new CodeDictionary();
                dictionaries.put(colName, dict);
            }

            dict.addKeyCode(colValue, code);
        }

        dataFileReader.close();

        return dictionaries;
    }

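    /**
     * Merges newly generated dictionary entries (the Avro part files under
     * {@code tmp/} in the given directory) into the dictionary file at
     * {@code <dir>/dictionary}. Any existing dictionary is first loaded and then
     * preserved as {@code <dir>/_dictionary.old}; new values receive codes via
     * {@link CodeDictionary#addKey(String)}, and the merged result is written back.
     *
     * @param conf Hadoop configuration
     * @param dir directory containing the dictionary file and the tmp/part-* outputs
     * @throws IOException if the dictionary files cannot be read or written
     */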
    public static void mergeDictionaries(Configuration conf, Path dir) throws IOException {
        Map<String, CodeDictionary> dictionaries = new HashMap<String, CodeDictionary>();
        FileSystem fs = FileSystem.get(conf);

        Path currentDictPath = new Path(dir, "dictionary");
        Schema schema = getSchema();

        // Read the existing dictionaries
        if (fs.exists(currentDictPath)) {
            dictionaries.putAll(loadDictionary(currentDictPath.toString(), true, conf));

            // move the current dictionary to new file
            Path oldPath = new Path(dir, "_dictionary.old");
            fs.delete(oldPath, true);
            fs.rename(currentDictPath, oldPath);
        }

        // Read the new entries
        Path globPath = new Path(dir, "tmp/part-*");
        FileStatus[] allStatus = fs.globStatus(globPath);
        for (FileStatus status : allStatus) {
            DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
            DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(
                    new FsInput(status.getPath(), conf), datumReader);
            GenericRecord record = null;
            while (dataFileReader.hasNext()) {
                record = dataFileReader.next();
                String colName = record.get("colname").toString();
                String colValue = record.get("colvalue").toString();

                CodeDictionary dict = dictionaries.get(colName);
                if (dict == null) {
                    dict = new CodeDictionary();
                    dictionaries.put(colName, dict);
                }

                dict.addKey(colValue);
            }
            dataFileReader.close();
        }

        // Write the dictionaries back
        DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
        FSDataOutputStream out = fs.create(currentDictPath);

        dataFileWriter.create(schema, out);
        Record record = new Record(schema);

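        // Append one Avro record per (column name, value, code) entry in the merged dictionaries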
        for (Map.Entry<String, CodeDictionary> entry : dictionaries.entrySet()) {
            String colName = entry.getKey();
            CodeDictionary dict = entry.getValue();

            for (String colValue : dict.keySet()) {
                int code = dict.getCodeForKey(colValue);
                record.put("colname", colName);
                record.put("colvalue", colValue);
                record.put("code", code);

                dataFileWriter.append(record);
            }
        }
        dataFileWriter.close();

    }

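    /**
     * Returns the Avro schema of a dictionary entry: a record named "dictionary"
     * with string fields {@code colname} and {@code colvalue} and an int field
     * {@code code}.
     */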
    public static Schema getSchema() {
        Field[] fields = { new Schema.Field("colname", Schema.create(Type.STRING), null, null),
                new Schema.Field("colvalue", Schema.create(Type.STRING), null, null),
                new Schema.Field("code", Schema.create(Type.INT), null, null) };

        Schema schema = Schema.createRecord("dictionary", null, null, false);
        schema.setFields(Arrays.asList(fields));

        return schema;
    }
}
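
Usage

The snippet below is a minimal, hypothetical sketch of how these static helpers might be driven. It assumes a Hadoop file system reachable through a default Configuration, that a job has already written new dictionary entries as Avro part files under <dir>/tmp, and that the directory name /data/relation/dict is purely illustrative.

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import com.linkedin.cubert.plan.physical.GenerateDictionary;
import com.linkedin.cubert.utils.CodeDictionary;

public class GenerateDictionaryExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Illustrative directory; a real job would use its configured output path.
        Path dictDir = new Path("/data/relation/dict");

        // Fold the Avro part files under <dictDir>/tmp into <dictDir>/dictionary,
        // keeping the previous dictionary as <dictDir>/_dictionary.old.
        GenerateDictionary.mergeDictionaries(conf, dictDir);

        // Read the merged dictionary back from HDFS (isHDFS = true).
        Map<String, CodeDictionary> dictionaries =
                GenerateDictionary.loadDictionary(new Path(dictDir, "dictionary").toString(),
                        true, conf);

        for (String column : dictionaries.keySet()) {
            System.out.println("Loaded dictionary for column: " + column);
        }
    }
}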