Java tutorial
/** * (C) Copyright IBM Corp. 2010, 2015 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.ibm.bi.dml.runtime.transform; import java.io.BufferedWriter; import java.io.IOException; import java.io.OutputStreamWriter; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.wink.json4j.JSONArray; import org.apache.wink.json4j.JSONException; import org.apache.wink.json4j.JSONObject; import com.google.common.base.Functions; import com.google.common.collect.Ordering; import com.ibm.bi.dml.runtime.util.UtilFunctions; public class DummycodeAgent extends TransformationAgent { private static final long serialVersionUID = 5832130477659116489L; private int[] _dcdList = null; private long numCols = 0; private HashMap<Integer, HashMap<String, String>> _finalMaps = null; private HashMap<Integer, HashMap<String, Long>> _finalMapsCP = null; private int[] _binList = null; private int[] _numBins = null; private int[] _domainSizes = null; // length = #of dummycoded columns private int[] _dcdColumnMap = null; // to help in translating between original and dummycoded column IDs private long _dummycodedLength = 0; // #of columns after dummycoded DummycodeAgent(int[] list) { _dcdList = list; } DummycodeAgent(JSONObject parsedSpec, long ncol) throws JSONException { numCols = ncol; if (!parsedSpec.containsKey(TX_METHOD.DUMMYCODE.toString())) return; JSONObject obj = (JSONObject) parsedSpec.get(TX_METHOD.DUMMYCODE.toString()); JSONArray attrs = (JSONArray) obj.get(JSON_ATTRS); _dcdList = new int[attrs.size()]; for (int i = 0; i < _dcdList.length; i++) _dcdList[i] = UtilFunctions.toInt(attrs.get(i)); } public int[] dcdList() { return _dcdList; } /** * Method to output transformation metadata from the mappers. * This information is collected and merged by the reducers. * * @param out * @throws IOException * */ @Override public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException { // There is no metadata required for dummycode. // Required information is output from RecodeAgent. return; } @Override public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException { // Nothing to do here } public void setRecodeMaps(HashMap<Integer, HashMap<String, String>> maps) { _finalMaps = maps; } public void setRecodeMapsCP(HashMap<Integer, HashMap<String, Long>> maps) { _finalMapsCP = maps; } public void setNumBins(int[] binList, int[] numbins) { _binList = binList; _numBins = numbins; } /** * Method to generate dummyCodedMaps.csv, with the range of column IDs for each variable in the original data. * * Each line in dummyCodedMaps.csv file is of the form: [ColID, 1/0, st, end] * 1/0 indicates if ColID is dummycoded or not * [st,end] is the range of dummycoded column numbers for the given ColID * * It also generates coltypes.csv, with the type (scale, nominal, etc.) of columns in the output. * Recoded columns are of type nominal, binner columns are of type ordinal, dummycoded columns are of type * dummycoded, and the remaining are of type scale. * * @param fs * @param txMtdDir * @param numCols * @param ra * @param ba * @return Number of columns in the transformed data * @throws IOException */ public int genDcdMapsAndColTypes(FileSystem fs, String txMtdDir, int numCols, TfUtils agents) throws IOException { // initialize all column types in the transformed data to SCALE ColumnTypes[] ctypes = new ColumnTypes[(int) _dummycodedLength]; for (int i = 0; i < _dummycodedLength; i++) ctypes[i] = ColumnTypes.SCALE; _dcdColumnMap = new int[numCols]; Path pt = new Path(txMtdDir + "/Dummycode/" + DCD_FILE_NAME); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true))); int sum = 1; int idx = 0; for (int colID = 1; colID <= numCols; colID++) { if (_dcdList != null && idx < _dcdList.length && _dcdList[idx] == colID) { br.write(colID + "," + "1" + "," + sum + "," + (sum + _domainSizes[idx] - 1) + "\n"); _dcdColumnMap[colID - 1] = (sum + _domainSizes[idx] - 1) - 1; for (int i = sum; i <= (sum + _domainSizes[idx] - 1); i++) ctypes[i - 1] = ColumnTypes.DUMMYCODED; sum += _domainSizes[idx]; idx++; } else { br.write(colID + "," + "0" + "," + sum + "," + sum + "\n"); _dcdColumnMap[colID - 1] = sum - 1; if (agents.getBinAgent().isBinned(colID) != -1) ctypes[sum - 1] = ColumnTypes.ORDINAL; // binned variable results in an ordinal column if (agents.getRecodeAgent().isRecoded(colID) != -1) ctypes[sum - 1] = ColumnTypes.NOMINAL; sum += 1; } } br.close(); // Write coltypes.csv pt = new Path(txMtdDir + "/" + COLTYPES_FILE_NAME); br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true))); br.write(columnTypeToID(ctypes[0]) + ""); for (int i = 1; i < _dummycodedLength; i++) br.write("," + columnTypeToID(ctypes[i])); br.close(); return sum - 1; } /** * Given a dummycoded column id, find the corresponding original column ID. * * @param colID * @return */ public int mapDcdColumnID(int colID) { for (int i = 0; i < _dcdColumnMap.length; i++) { int st = (i == 0 ? 1 : _dcdColumnMap[i - 1] + 1 + 1); int end = _dcdColumnMap[i] + 1; //System.out.println((i+1) + ": " + "[" + st + "," + end + "]"); if (colID >= st && colID <= end) return i + 1; } return -1; } public String constructDummycodedHeader(String header, Pattern delim) { if (_dcdList == null && _binList == null) // none of the columns are dummycoded, simply return the given header return header; String[] names = delim.split(header, -1); List<String> newNames = null; StringBuilder sb = new StringBuilder(); // Dummycoding can be performed on either on a recoded column or on a binned column // process recoded columns if (_finalMapsCP != null && _dcdList != null) { for (int i = 0; i < _dcdList.length; i++) { int colID = _dcdList[i]; HashMap<String, Long> map = _finalMapsCP.get(colID); String colName = UtilFunctions.unquote(names[colID - 1]); if (map != null) { // order map entries by their recodeID Ordering<String> valueComparator = Ordering.natural().onResultOf(Functions.forMap(map)); newNames = valueComparator.sortedCopy(map.keySet()); // construct concatenated string of map entries sb.setLength(0); for (int idx = 0; idx < newNames.size(); idx++) { if (idx == 0) sb.append(colName + DCD_NAME_SEP + newNames.get(idx)); else sb.append(delim + colName + DCD_NAME_SEP + newNames.get(idx)); } names[colID - 1] = sb.toString(); // replace original column name with dcd name } } } else if (_finalMaps != null && _dcdList != null) { for (int i = 0; i < _dcdList.length; i++) { int colID = _dcdList[i]; HashMap<String, String> map = _finalMaps.get(colID); String colName = UtilFunctions.unquote(names[colID - 1]); if (map != null) { // order map entries by their recodeID (represented as Strings .. "1", "2", etc.) Ordering<String> orderByID = new Ordering<String>() { public int compare(String s1, String s2) { return (Integer.parseInt(s1) - Integer.parseInt(s2)); } }; newNames = orderByID.onResultOf(Functions.forMap(map)).sortedCopy(map.keySet()); // construct concatenated string of map entries sb.setLength(0); for (int idx = 0; idx < newNames.size(); idx++) { if (idx == 0) sb.append(colName + DCD_NAME_SEP + newNames.get(idx)); else sb.append(delim + colName + DCD_NAME_SEP + newNames.get(idx)); } names[colID - 1] = sb.toString(); // replace original column name with dcd name } } } // process binned columns if (_binList != null) for (int i = 0; i < _binList.length; i++) { int colID = _binList[i]; // need to consider only binned and dummycoded columns if (isDummyCoded(colID) == -1) continue; int numBins = _numBins[i]; String colName = UtilFunctions.unquote(names[colID - 1]); sb.setLength(0); for (int idx = 0; idx < numBins; idx++) if (idx == 0) sb.append(colName + DCD_NAME_SEP + "Bin" + (idx + 1)); else sb.append(delim + colName + DCD_NAME_SEP + "Bin" + (idx + 1)); names[colID - 1] = sb.toString(); // replace original column name with dcd name } // Construct the full header sb.setLength(0); for (int colID = 0; colID < names.length; colID++) { if (colID == 0) sb.append(names[colID]); else sb.append(delim + names[colID]); } //System.out.println("DummycodedHeader: " + sb.toString()); return sb.toString(); } @Override public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException { if (_dcdList == null) { _dummycodedLength = numCols; return; } // sort to-be dummycoded column IDs in ascending order. This is the order in which the new dummycoded record is constructed in apply() function. Arrays.sort(_dcdList); _domainSizes = new int[_dcdList.length]; _dummycodedLength = numCols; //HashMap<String, String> map = null; for (int i = 0; i < _dcdList.length; i++) { int colID = _dcdList[i]; // Find the domain size for colID using _finalMaps or _finalMapsCP int domainSize = 0; if (_finalMaps != null) { if (_finalMaps.get(colID) != null) domainSize = _finalMaps.get(colID).size(); } else { if (_finalMapsCP.get(colID) != null) domainSize = _finalMapsCP.get(colID).size(); } if (domainSize != 0) { // dummycoded column _domainSizes[i] = domainSize; } else { // binned column if (_binList != null) for (int j = 0; j < _binList.length; j++) { if (colID == _binList[j]) { _domainSizes[i] = _numBins[j]; break; } } } _dummycodedLength += _domainSizes[i] - 1; //System.out.println("colID=" + colID + ", domainsize=" + _domainSizes[i] + ", dcdLength=" + _dummycodedLength); } } /** * Method to apply transformations. * * @param words * @return */ @Override public String[] apply(String[] words, TfUtils agents) { if (_dcdList == null) return words; String[] nwords = new String[(int) _dummycodedLength]; int rcdVal = 0; for (int colID = 1, idx = 0, ncolID = 1; colID <= words.length; colID++) { if (idx < _dcdList.length && colID == _dcdList[idx]) { // dummycoded columns try { rcdVal = UtilFunctions.parseToInt(UtilFunctions.unquote(words[colID - 1])); nwords[ncolID - 1 + rcdVal - 1] = "1"; ncolID += _domainSizes[idx]; idx++; } catch (Exception e) { System.out.println("Error in dummycoding: colID=" + colID + ", rcdVal=" + rcdVal + ", word=" + words[colID - 1] + ", domainSize=" + _domainSizes[idx] + ", dummyCodedLength=" + _dummycodedLength); throw new RuntimeException(e); } } else { nwords[ncolID - 1] = words[colID - 1]; ncolID++; } } return nwords; } /** * Check if the given column ID is subjected to this transformation. * */ public int isDummyCoded(int colID) { if (_dcdList == null) return -1; int idx = Arrays.binarySearch(_dcdList, colID); return (idx >= 0 ? idx : -1); } @Override public void print() { System.out.print("Dummycoding List: \n "); for (int i : _dcdList) { System.out.print(i + " "); } System.out.println(); } }