Java tutorial: RecodeAgent.java (package com.ibm.bi.dml.runtime.transform)
/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.transform;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

import scala.Tuple2;

import com.google.common.collect.Ordering;
import com.ibm.bi.dml.runtime.transform.MVImputeAgent.MVMethod;
import com.ibm.bi.dml.runtime.util.UtilFunctions;

public class RecodeAgent extends TransformationAgent {

    private static final long serialVersionUID = 8213163881283341874L;

    private int[] _rcdList = null;
    private int[] _mvrcdList = null;
    private int[] _fullrcdList = null;

    // HashMap< columnID, HashMap<distinctValue, count> >
    private HashMap<Integer, HashMap<String, Long>> _rcdMaps =
            new HashMap<Integer, HashMap<String, Long>>();

    RecodeAgent(JSONObject parsedSpec) throws JSONException {
        int rcdCount = 0;

        if (parsedSpec.containsKey(TX_METHOD.RECODE.toString())) {
            JSONObject obj = (JSONObject) parsedSpec.get(TX_METHOD.RECODE.toString());
            JSONArray attrs = (JSONArray) obj.get(JSON_ATTRS);
            _rcdList = new int[attrs.size()];
            for (int i = 0; i < _rcdList.length; i++)
                _rcdList[i] = UtilFunctions.toInt(attrs.get(i));
            rcdCount = _rcdList.length;
        }

        if (parsedSpec.containsKey(TX_METHOD.MVRCD.toString())) {
            JSONObject obj = (JSONObject) parsedSpec.get(TX_METHOD.MVRCD.toString());
            JSONArray attrs = (JSONArray) obj.get(JSON_ATTRS);
            _mvrcdList = new int[attrs.size()];
            for (int i = 0; i < _mvrcdList.length; i++)
                _mvrcdList[i] = UtilFunctions.toInt(attrs.get(i));
            rcdCount += attrs.size();
        }

        if (rcdCount > 0) {
            _fullrcdList = new int[rcdCount];
            int idx = -1;
            if (_rcdList != null)
                for (int i = 0; i < _rcdList.length; i++)
                    _fullrcdList[++idx] = _rcdList[i];
            if (_mvrcdList != null)
                for (int i = 0; i < _mvrcdList.length; i++)
                    _fullrcdList[++idx] = _mvrcdList[i];
        }
    }

    void prepare(String[] words, TfUtils agents) {
        if (_rcdList == null && _mvrcdList == null)
            return;

        String w = null;
        for (int colID : _fullrcdList) {
            w = UtilFunctions.unquote(words[colID - 1].trim());

            if (_rcdMaps.get(colID) == null)
                _rcdMaps.put(colID, new HashMap<String, Long>());

            HashMap<String, Long> map = _rcdMaps.get(colID);
            Long count = map.get(w);
            if (count == null)
                map.put(w, new Long(1));
            else
                map.put(w, count + 1);
        }
    }

    private HashMap<String, Long> handleMVConstant(int colID, TfUtils agents, HashMap<String, Long> map) {
        MVImputeAgent mvagent = agents.getMVImputeAgent();
        if (mvagent.getMethod(colID) == MVMethod.CONSTANT) {
            // check if the "replacement" is part of the map. If not, add it.
            String repValue = mvagent.getReplacement(colID);
            if (repValue == null)
                throw new RuntimeException("Expecting a constant replacement value for column ID " + colID);

            repValue = UtilFunctions.unquote(repValue);
            Long count = map.get(repValue);
            long mvCount = agents.getValid() - mvagent.getNonMVCount(colID);
            if (count == null)
                map.put(repValue, mvCount);
            else
                map.put(repValue, count + mvCount);
        }
        return map;
    }

    /**
     * Method to output transformation metadata from the mappers.
     * This information is collected and merged by the reducers.
     *
     * @param out
     * @throws IOException
     */
    @Override
    public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out,
            int taskID, TfUtils agents) throws IOException {
        mapOutputHelper(taskID, out, null, agents);
    }

    public ArrayList<Tuple2<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID,
            ArrayList<Tuple2<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
        mapOutputHelper(taskID, null, list, agents);
        return list;
    }

    public void mapOutputHelper(int taskID, OutputCollector<IntWritable, DistinctValue> out,
            ArrayList<Tuple2<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
        if (_rcdList == null && _mvrcdList == null)
            return;

        try {
            for (int i = 0; i < _fullrcdList.length; i++) {
                int colID = _fullrcdList[i];
                HashMap<String, Long> map = _rcdMaps.get(colID);

                if (map != null) {
                    map = handleMVConstant(colID, agents, map);

                    if (out != null) {
                        IntWritable iw = new IntWritable(colID);
                        for (String s : map.keySet())
                            out.collect(iw, new DistinctValue(s, map.get(s)));
                    }
                    else if (list != null) {
                        for (String s : map.keySet())
                            list.add(new Tuple2<Integer, DistinctValue>(colID, new DistinctValue(s, map.get(s))));
                    }
                }
            }
        }
        catch (Exception e) {
            throw new IOException(e);
        }
    }

    /**
     * Function to output transformation metadata, including:
     * - recode maps,
     * - number of distinct values,
     * - mode, and
     * - imputation value (in the case of global_mode)
     *
     * The column for which this function is invoked can be one of the following:
     * - just recoded (write .map, .ndistinct, .mode)
     * - just mv imputed (w/ global_mode) (write .impute)
     * - both recoded and mv imputed (write .map, .ndistinct, .mode, .impute)
     *
     * @param map
     * @param outputDir
     * @param colID
     * @param fs
     * @param mvagent
     * @throws IOException
     */
    private void writeMetadata(HashMap<String, Long> map, String outputDir, int colID, FileSystem fs,
            TfUtils agents, boolean fromCP) throws IOException {
        // output recode maps and mode
        MVImputeAgent mvagent = agents.getMVImputeAgent();
        String mode = null;
        Long count = null;
        int rcdIndex = 0, modeIndex = 0;
        long maxCount = Long.MIN_VALUE;

        boolean isRecoded = (isRecoded(colID) != -1);
        boolean isModeImputed = (mvagent.getMethod(colID) == MVMethod.GLOBAL_MODE);

        Path pt = new Path(outputDir + "/Recode/" + agents.getName(colID) + RCD_MAP_FILE_SUFFIX);
        BufferedWriter br = null;
        if (isRecoded)
            br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));

        // remove NA strings
        if (agents.getNAStrings() != null)
            for (String naword : agents.getNAStrings())
                map.remove(naword);

        if (fromCP)
            map = handleMVConstant(colID, agents, map);

        if (map.size() == 0)
            throw new RuntimeException("Can not proceed since \"" + agents.getName(colID) + "\" (id=" + colID
                    + ") contains only the missing values, and not a single valid value -- set imputation method to \"constant\".");

        // Order entries by category (string) value
        Ordering<String> valueComparator = Ordering.natural();
        List<String> newNames = valueComparator.sortedCopy(map.keySet());

        for (String w : newNames) { //map.keySet()) {
            count = map.get(w);
            ++rcdIndex;

            // output (w, count, rcdIndex)
            if (br != null)
                br.write(UtilFunctions.quote(w) + TXMTD_SEP + rcdIndex + TXMTD_SEP + count + "\n");

            if (maxCount < count) {
                maxCount = count;
                mode = w;
                modeIndex = rcdIndex;
            }

            // Replace count with recode index (useful when invoked from CP)
            map.put(w, (long) rcdIndex);
        }

        if (br != null)
            br.close();

        if (mode == null) {
            mode = "";
            maxCount = 0;
        }

        if (isRecoded) {
            // output mode
            pt = new Path(outputDir + "/Recode/" + agents.getName(colID) + MODE_FILE_SUFFIX);
            br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
            br.write(UtilFunctions.quote(mode) + "," + modeIndex + "," + maxCount);
            br.close();

            // output number of distinct values
            pt = new Path(outputDir + "/Recode/" + agents.getName(colID) + NDISTINCT_FILE_SUFFIX);
            br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
            br.write("" + map.size());
            br.close();
        }

        if (isModeImputed) {
            pt = new Path(outputDir + "/Impute/" + agents.getName(colID) + MV_FILE_SUFFIX);
            br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
            br.write(colID + "," + UtilFunctions.quote(mode));
            br.close();
        }
    }

    public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
        if (_rcdList == null && _mvrcdList == null)
            return;

        for (int i = 0; i < _fullrcdList.length; i++) {
            int colID = _fullrcdList[i];
            writeMetadata(_rcdMaps.get(colID), outputDir, colID, fs, agents, true);
        }
    }

    /**
     * Method to merge map output transformation metadata.
     *
     * @param values
     * @return
     * @throws IOException
     */
    @Override
    public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir,
            int colID, FileSystem fs, TfUtils agents) throws IOException {
        HashMap<String, Long> map = new HashMap<String, Long>();

        DistinctValue d = new DistinctValue();
        String word = null;
        Long count = null, val = null;
        while (values.hasNext()) {
            d.reset();
            d = values.next();
            word = d.getWord();
            count = d.getCount();

            val = map.get(word);
            if (val == null)
                map.put(word, count);
            else
                map.put(word, val + count);
        }

        writeMetadata(map, outputDir, colID, fs, agents, false);
    }

    // ------------------------------------------------------------------------------------------------

    public HashMap<Integer, HashMap<String, Long>> getCPRecodeMaps() {
        return _rcdMaps;
    }

    HashMap<Integer, HashMap<String, String>> _finalMaps = null;

    public HashMap<Integer, HashMap<String, String>> getRecodeMaps() {
        return _finalMaps;
    }

    /**
     * Method to load recode maps of all attributes, at once.
     *
     * @param job
     * @throws IOException
     */
    @Override
    public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
        if (_rcdList == null)
            return;

        _finalMaps = new HashMap<Integer, HashMap<String, String>>();

        if (fs.isDirectory(txMtdDir)) {
            for (int i = 0; i < _rcdList.length; i++) {
                int colID = _rcdList[i];

                Path path = new Path(txMtdDir + "/Recode/" + agents.getName(colID) + RCD_MAP_FILE_SUFFIX);
                TfUtils.checkValidInputFile(fs, path, true);

                HashMap<String, String> map = new HashMap<String, String>();

                BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
                String line = null, word = null;
                String rcdIndex = null;

                // Example line to parse: "WN (1)67492",1,61975
                while ((line = br.readLine()) != null) {
                    // last occurrence of quotation mark
                    int idxQuote = line.lastIndexOf('"');
                    word = UtilFunctions.unquote(line.substring(0, idxQuote + 1));

                    int idx = idxQuote + 2;
                    while (line.charAt(idx) != TXMTD_SEP.charAt(0))
                        idx++;
                    rcdIndex = line.substring(idxQuote + 2, idx);

                    map.put(word, rcdIndex);
                }
                br.close();
                _finalMaps.put(colID, map);
            }
        }
        else {
            fs.close();
            throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir);
        }
    }

    /**
     * Method to apply transformations.
     *
     * @param words
     * @return
     */
    @Override
    public String[] apply(String[] words, TfUtils agents) {
        if (_rcdList == null)
            return words;

        for (int i = 0; i < _rcdList.length; i++) {
            int colID = _rcdList[i];
            try {
                words[colID - 1] = _finalMaps.get(colID).get(UtilFunctions.unquote(words[colID - 1].trim()));
            }
            catch (NullPointerException e) {
                System.err.println("Maps for colID=" + colID + " may be null (map = " + _finalMaps.get(colID) + ")");
                throw new RuntimeException(e);
            }
        }
        return words;
    }

    /**
     * Check if the given column ID is subjected to this transformation.
     */
    public int isRecoded(int colID) {
        if (_rcdList == null)
            return -1;

        int idx = Arrays.binarySearch(_rcdList, colID);
        return (idx >= 0 ? idx : -1);
    }

    public String[] cp_apply(String[] words, TfUtils agents) {
        if (_rcdList == null)
            return words;

        String w = null;
        for (int i = 0; i < _rcdList.length; i++) {
            int colID = _rcdList[i];
            try {
                w = UtilFunctions.unquote(words[colID - 1].trim());
                words[colID - 1] = Long.toString(_rcdMaps.get(colID).get(w));
            }
            catch (NullPointerException e) {
                if (w.isEmpty() && agents.isNA(""))
                    throw new RuntimeException("Empty string (a missing value) in column ID " + colID
                            + " is not handled. Consider adding an imputation method on this column.");
                throw new RuntimeException("ColID=" + colID + ", word=" + words[colID - 1]
                        + ", maps entry not found (map = " + _rcdMaps.get(colID) + ")");
            }
        }
        return words;
    }

    public void printMaps() {
        for (Integer k : _rcdMaps.keySet()) {
            System.out.println("Column " + k);
            HashMap<String, Long> map = _rcdMaps.get(k);
            for (String w : map.keySet()) {
                System.out.println(" " + w + " : " + map.get(w));
            }
        }
    }

    public void print() {
        System.out.print("Recoding List: \n ");
        for (int i : _rcdList) {
            System.out.print(i + " ");
        }
        System.out.println();
    }
}
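The listing above depends on the rest of the transformation framework (TfUtils, MVImputeAgent, DistinctValue, Hadoop and JSON I/O), so it cannot be compiled or run on its own. The following is a simplified, self-contained sketch of the core technique only, not part of the IBM code: it mirrors the three recoding steps implemented by prepare(), writeMetadata(), and cp_apply(), namely counting the distinct values of a categorical column, assigning 1-based recode indices to the values in natural (sorted) order while tracking the most frequent value as the mode, and then replacing the original strings by their indices. All names in the sketch (RecodeSketch, counts, recodeMap) are illustrative and do not exist in the package above.

import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

// Standalone illustration only; assumes nothing beyond the JDK.
public class RecodeSketch {

    public static void main(String[] args) {
        // Raw values of one categorical column, analogous to words[colID - 1] in prepare().
        String[] column = { "cat", "dog", "cat", "bird", "dog", "cat" };

        // Step 1: count distinct values, as prepare() does per column ID.
        Map<String, Long> counts = new HashMap<>();
        for (String w : column)
            counts.merge(w, 1L, Long::sum);

        // Step 2: sort distinct values in natural order and assign 1-based recode
        // indices, tracking the mode, as writeMetadata() does via Ordering.natural().
        Map<String, Long> recodeMap = new HashMap<>();
        long rcdIndex = 0, maxCount = Long.MIN_VALUE;
        String mode = null;
        for (Map.Entry<String, Long> e : new TreeMap<>(counts).entrySet()) {
            recodeMap.put(e.getKey(), ++rcdIndex);
            if (e.getValue() > maxCount) {
                maxCount = e.getValue();
                mode = e.getKey();
            }
        }

        // Step 3: apply the map, as cp_apply() does, replacing each string by its index.
        for (int i = 0; i < column.length; i++)
            column[i] = Long.toString(recodeMap.get(column[i]));

        // Expected codes: bird -> 1, cat -> 2, dog -> 3; mode is "cat".
        System.out.println("recode map: " + recodeMap + ", mode: " + mode);
        System.out.println(String.join(",", column));
    }
}

Running the sketch prints the recode map, the mode, and the recoded row "2,3,2,1,3,2". The real RecodeAgent additionally persists this information as the .map, .mode, .ndistinct, and .impute metadata files that loadTxMtd() and apply() consume later in the pipeline.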