/*
 * (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */
package com.linkedin.cubert.plan.physical;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.mapred.FsInput;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.linkedin.cubert.utils.CodeDictionary;

/**
 * Executes a Map-Reduce job to generate and refresh dictionaries for the various columns
 * in a relation.
 *
 * Dictionaries will be created for columns that have {@code isDictionaryEncoded} set to
 * {@code true}.
 *
 * @author Maneesh Varshney
 *
 */
public class GenerateDictionary
{
//    public static class CreateDictionaryMapper<K, V> extends Mapper<K, V, Text, Text>
//    {
//        private String[] columnNames;
//        private boolean[] isDictionaryField;
//        private List<Set<String>> emittedKeys = new ArrayList<Set<String>>();
//
//        private TupleCreator tupleCreator;
//        private final Text reuseKey = new Text();
//        private final Text reuseValue = new Text();
//
//        private String replaceNull = "";
//        private String defaultValue = null;
//        private boolean defaultEmmited = false;
//
//        @Override
//        protected void setup(Context context) throws IOException,
//                InterruptedException
//        {
//            Configuration conf = context.getConfiguration();
//
//            FileCache.initialize(conf);
//
//            FileCache cache = FileCache.get();
//
//            ObjectMapper mapper = new ObjectMapper();
//            ArrayNode mapCommands =
//                    mapper.readValue(conf.get(CubertStrings.JSON_MAP_OPERATOR_LIST),
//                                     ArrayNode.class);
//            JsonNode inputJson = mapCommands.get(0).get("input");
//            JsonNode outputJson =
//                    mapper.readValue(conf.get(CubertStrings.JSON_OUTPUT), JsonNode.class);
//
//            if (inputJson.has("replaceNull"))
//            {
//                replaceNull = JsonUtils.getText(inputJson, "replaceNull");
//            }
//
//            if (inputJson.has("defaultValue"))
//            {
//                defaultValue = JsonUtils.getText(inputJson, "defaultValue");
//            }
//
//            tupleCreator = new AvroTupleCreator();
//            tupleCreator.setup(inputJson);
//
//            // Load the previous dictionary, if available
//            Map<String, CodeDictionary> dictionaries = null;
//
//            String previousDict = conf.get(CubertStrings.DICTIONARY_RELATION);
//            System.out.println(">>> previous dict " + previousDict);
//            if (previousDict != null)
//            {
//                String filename = cache.getCachedFile(previousDict);
//
//                dictionaries = loadDictionary(filename, false, null);
//            }
//
//            String outputColumns = JsonUtils.getText(outputJson, "columns");
//            BlockSchema inputSchema = new BlockSchema(inputJson.get("schema"));
//            BlockSchema outputSchema = new BlockSchema(outputColumns);
//            Set<String> dictionaryColumnSet = outputSchema.asMap().keySet();
//
//            int numInputColumns = inputSchema.getNumColumns();
//            columnNames = inputSchema.getColumnNames();
//
//            isDictionaryField = new boolean[numInputColumns];
//            for (int i = 0; i < numInputColumns; i++)
//            {
//                String colName = inputSchema.getName(i);
//                isDictionaryField[i] = dictionaryColumnSet.contains(colName);
//
//                if (isDictionaryField[i])
//                {
//                    Set<String> emitted = new HashSet<String>();
//                    emittedKeys.add(emitted);
//
//                    if (dictionaries != null && dictionaries.containsKey(colName))
//                    {
//                        // Add the existing keys to emittedKeys, so that we don't emit
//                        // them again
//                        CodeDictionary dict = dictionaries.get(colName);
//                        emittedKeys.get(i).addAll(dict.keySet());
//                    }
//                }
//                else
//                {
//                    emittedKeys.add(null);
//                }
//            }
//        }
//
//        @Override
//        protected void map(K key, V value, Context context) throws IOException,
//                InterruptedException
//        {
//            map(tupleCreator.create(key, value), context);
//        }
//
//        void map(Tuple tuple, Context context) throws IOException,
//                InterruptedException
//        {
//            for (int i = 0; i < isDictionaryField.length; i++)
//            {
//                if (!isDictionaryField[i])
//                    continue;
//
//                Object val = tuple.get(i);
//                String colValue;
//
//                if (val == null)
//                    colValue = replaceNull;
//                else
//                    colValue = val.toString();
//
//                if (emittedKeys.get(i).contains(colValue))
//                {
//                    continue;
//                }
//
//                String colName = columnNames[i];
//                emitKeyValue(colName, colValue, context, i);
//
//                if (defaultValue != null && !defaultEmmited)
//                    emitKeyValue(colName, defaultValue, context, i);
//            }
//            defaultEmmited = true;
//        }
//
//        void emitKeyValue(String colName,
//                          String colValue,
//                          Context context,
//                          int columnIndex) throws IOException,
//                InterruptedException
//        {
//            reuseKey.set(colName);
//            reuseValue.set(colValue);
//            context.write(reuseKey, reuseValue);
//            emittedKeys.get(columnIndex).add(colValue);
//        }
//    }

//    public static class CreateDictionaryReducer extends
//            Reducer<Text, Text, AvroKey<Record>, NullWritable>
//    {
//        Record record;
//
//        @Override
//        protected void setup(Context context) throws IOException,
//                InterruptedException
//        {
//            Configuration conf = context.getConfiguration();
//            Schema keySchema = AvroJob.getOutputKeySchema(conf);
//
//            record = new Record(keySchema);
//        }
//
//        @Override
//        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException,
//                InterruptedException
//        {
//            record.put("colname", key.toString());
//            record.put("code", -1);
//
//            for (Text value : values)
//            {
//                record.put("colvalue", value.toString());
//                context.write(new AvroKey<Record>(record), NullWritable.get());
//            }
//        }
//    }

    /**
     * Loads a dictionary Avro file (records with fields colname, colvalue, code) into a
     * map from column name to its {@link CodeDictionary}.
     */
    public static Map<String, CodeDictionary> loadDictionary(String path,
                                                             boolean isHDFS,
                                                             Configuration conf) throws IOException
    {
        Map<String, CodeDictionary> dictionaries = new HashMap<String, CodeDictionary>();
        Schema schema = getSchema();

        DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
        DataFileReader<GenericRecord> dataFileReader;

        if (isHDFS)
        {
            dataFileReader =
                    new DataFileReader<GenericRecord>(new FsInput(new Path(path), conf),
                                                      datumReader);
        }
        else
        {
            dataFileReader = new DataFileReader<GenericRecord>(new File(path), datumReader);
        }

        GenericRecord record = null;
        while (dataFileReader.hasNext())
        {
            record = dataFileReader.next();

            String colName = record.get("colname").toString();
            String colValue = record.get("colvalue").toString();
            int code = (Integer) record.get("code");

            CodeDictionary dict = dictionaries.get(colName);
            if (dict == null)
            {
                dict = new CodeDictionary();
                dictionaries.put(colName, dict);
            }

            dict.addKeyCode(colValue, code);
        }
        dataFileReader.close();

        return dictionaries;
    }

    /**
     * Merges the existing dictionary at {@code dir/dictionary} (if any) with the new
     * entries produced under {@code dir/tmp/part-*}, and writes the combined dictionary
     * back to {@code dir/dictionary}.
     */
    public static void mergeDictionaries(Configuration conf, Path dir) throws IOException
    {
        Map<String, CodeDictionary> dictionaries = new HashMap<String, CodeDictionary>();

        FileSystem fs = FileSystem.get(conf);
        Path currentDictPath = new Path(dir, "dictionary");
        Schema schema = getSchema();

        // Read the existing dictionaries
        if (fs.exists(currentDictPath))
        {
            dictionaries.putAll(loadDictionary(currentDictPath.toString(), true, conf));

            // move the current dictionary to new file
            Path oldPath = new Path(dir, "_dictionary.old");
            fs.delete(oldPath, true);
            fs.rename(currentDictPath, oldPath);
        }

        // Read the new entries
        Path globPath = new Path(dir, "tmp/part-*");
        FileStatus[] allStatus = fs.globStatus(globPath);
        for (FileStatus status : allStatus)
        {
            DatumReader<GenericRecord> datumReader =
                    new GenericDatumReader<GenericRecord>(schema);
            DataFileReader<GenericRecord> dataFileReader =
                    new DataFileReader<GenericRecord>(new FsInput(status.getPath(), conf),
                                                      datumReader);

            GenericRecord record = null;
            while (dataFileReader.hasNext())
            {
                record = dataFileReader.next();

                String colName = record.get("colname").toString();
                String colValue = record.get("colvalue").toString();

                CodeDictionary dict = dictionaries.get(colName);
                if (dict == null)
                {
                    dict = new CodeDictionary();
                    dictionaries.put(colName, dict);
                }

                dict.addKey(colValue);
            }

            // close this part file before moving on to the next one
            dataFileReader.close();
        }

        // Write the dictionaries back
        DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
        DataFileWriter<GenericRecord> dataFileWriter =
                new DataFileWriter<GenericRecord>(datumWriter);

        FSDataOutputStream out = fs.create(currentDictPath);
        dataFileWriter.create(schema, out);

        Record record = new Record(schema);
        for (Map.Entry<String, CodeDictionary> entry : dictionaries.entrySet())
        {
            String colName = entry.getKey();
            CodeDictionary dict = entry.getValue();

            for (String colValue : dict.keySet())
            {
                int code = dict.getCodeForKey(colValue);

                record.put("colname", colName);
                record.put("colvalue", colValue);
                record.put("code", code);

                dataFileWriter.append(record);
            }
        }

        dataFileWriter.close();
    }

    /**
     * Returns the Avro schema of a dictionary record:
     * (colname: string, colvalue: string, code: int).
     */
    public static Schema getSchema()
    {
        Field[] fields =
                { new Schema.Field("colname", Schema.create(Type.STRING), null, null),
                        new Schema.Field("colvalue", Schema.create(Type.STRING), null, null),
                        new Schema.Field("code", Schema.create(Type.INT), null, null) };

        Schema schema = Schema.createRecord("dictionary", null, null, false);
        schema.setFields(Arrays.asList(fields));

        return schema;
    }
}
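
/*
 * Illustrative usage sketch, not part of the original Cubert source: it shows how the
 * static helpers above might be driven after the dictionary MR job has written its
 * "tmp/part-*" Avro outputs. The directory "/data/dictionaries" and the column name
 * "member_id" are hypothetical placeholders; in practice these come from the Cubert
 * physical plan.
 */
class GenerateDictionaryUsageExample
{
    public static void main(String[] args) throws IOException
    {
        Configuration conf = new Configuration();

        // Hypothetical job directory: mergeDictionaries folds <dir>/tmp/part-* into
        // <dir>/dictionary, preserving codes already assigned in a previous dictionary.
        Path dictDir = new Path("/data/dictionaries");
        GenerateDictionary.mergeDictionaries(conf, dictDir);

        // Read the merged dictionary back from HDFS and look up the code for one key.
        Map<String, CodeDictionary> dictionaries =
                GenerateDictionary.loadDictionary(new Path(dictDir, "dictionary").toString(),
                                                  true,
                                                  conf);

        CodeDictionary memberDict = dictionaries.get("member_id");
        if (memberDict != null)
        {
            System.out.println("code for key 12345 = " + memberDict.getCodeForKey("12345"));
        }
    }
}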