// Java tutorial  (stray non-code header; commented out so the file compiles)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.transform;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.functionobjects.CM;
import org.apache.sysml.runtime.functionobjects.KahanPlus;
import org.apache.sysml.runtime.functionobjects.Mean;
import org.apache.sysml.runtime.instructions.cp.CM_COV_Object;
import org.apache.sysml.runtime.instructions.cp.KahanObject;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.matrix.operators.CMOperator;
import org.apache.sysml.runtime.matrix.operators.CMOperator.AggregateOperationTypes;
import org.apache.sysml.runtime.transform.encode.Encoder;
import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
import org.apache.sysml.runtime.util.UtilFunctions;

/**
 * Encoder for missing-value (MV) imputation and, optionally, scaling of numeric
 * attributes. Imputation methods are per-column: global mean, global mode, or a
 * user-supplied constant. For columns that are scaled (mean-subtraction or
 * z-scoring), this agent also accumulates column means and variances using
 * Kahan-compensated summation ({@link KahanObject}) and central-moment objects
 * ({@link CM_COV_Object}).
 *
 * The class supports two execution paths visible in this file:
 * (1) a Hadoop MapReduce path where per-mapper partial statistics are emitted as
 * prefixed string tokens (see {@code MEAN_PREFIX} etc.) via
 * {@code mapOutputTransformationMetadata} and merged in
 * {@code mergeAndOutputTransformationMetadata}; and
 * (2) a CP/Spark path via {@code build}/{@code apply} on {@link FrameBlock}s.
 */
public class MVImputeAgent extends Encoder {
	private static final long serialVersionUID = 9057868620144662194L;

	// Prefixes used to tag the partial statistics emitted by mappers; the reducer
	// dispatches on these prefixes when merging (see mergeAndOutputTransformationMetadata).
	public static final String MEAN_PREFIX = "mean";
	public static final String VARIANCE_PREFIX = "var";
	public static final String CORRECTION_PREFIX = "correction";
	public static final String COUNT_PREFIX = "validcount"; // #of valid or non-missing values in a column
	public static final String TOTAL_COUNT_PREFIX = "totalcount"; // #of total records processed by a mapper
	public static final String CONSTANT_PREFIX = "constant";

	// NOTE: ordinal positions matter — the JSON spec encodes methods as integers
	// that are mapped through MVMethod.values()[...] in the constructors.
	public enum MVMethod { INVALID, GLOBAL_MEAN, GLOBAL_MODE, CONSTANT };

	private MVMethod[] _mvMethodList = null; // per imputed column: imputation method
	private MVMethod[] _mvscMethodList = null; // scaling methods for attributes that are imputed and also scaled
	private BitSet _isMVScaled = null; // bit i set iff imputed column i is also scaled

	private CM _varFn = CM.getCMFnObject(AggregateOperationTypes.VARIANCE); // function object that understands variance computation

	// objects required to compute mean and variance of all non-missing entries
	private Mean _meanFn = Mean.getMeanFnObject(); // function object that understands mean computation
	private KahanObject[] _meanList = null; // column-level means, computed so far
	private long[] _countList = null; // #of non-missing values
	private CM_COV_Object[] _varList = null; // column-level variances, computed so far (for scaling)

	private int[] _scnomvList = null; // List of attributes that are scaled but not imputed
	private MVMethod[] _scnomvMethodList = null; // scaling methods: 0 for invalid; 1 for mean-subtraction; 2 for z-scoring
	private KahanObject[] _scnomvMeanList = null; // column-level means, for attributes scaled but not imputed
	private long[] _scnomvCountList = null; // #of non-missing values, for attributes scaled but not imputed
	private CM_COV_Object[] _scnomvVarList = null; // column-level variances, computed so far

	private String[] _replacementList = null; // replacements: for global_mean, mean; and for global_mode, recode id of mode category
	private String[] _NAstrings = null; // strings to be treated as missing values
	private List<Integer> _rcList = null; // recoded column IDs (set via initRecodeIDList)
	private HashMap<Integer, HashMap<String, Long>> _hist = null; // per-column category histograms for mode computation

	public String[] getReplacements() { return _replacementList; }
	public KahanObject[] getMeans() { return _meanList; }
	public CM_COV_Object[] getVars() { return _varList; }
	public KahanObject[] getMeans_scnomv() { return _scnomvMeanList; }
	public CM_COV_Object[] getVars_scnomv() { return _scnomvVarList; }

	/**
	 * Constructs the agent from the new-style JSON spec (per-column objects with
	 * "method"/"value" entries), used on the frame-based encode path.
	 *
	 * @param parsedSpec transform specification (must contain an impute entry)
	 * @param colnames   frame column names used to resolve column IDs
	 * @param clen       number of columns
	 * @throws JSONException if the spec is malformed
	 */
	public MVImputeAgent(JSONObject parsedSpec, String[] colnames, int clen) throws JSONException {
		super(null, clen);
		//handle column list
		int[] collist = TfMetaUtils.parseJsonObjectIDList(parsedSpec, colnames, TfUtils.TXMETHOD_IMPUTE);
		initColList(collist);
		//handle method list
		parseMethodsAndReplacments(parsedSpec);
		//create reuse histograms
		_hist = new HashMap<Integer, HashMap<String, Long>>();
	}

	/**
	 * Constructs the agent from the old-style JSON spec (parallel "attributes"/
	 * "methods" arrays), used on the MR path. Also wires up scaling: columns in
	 * the scale spec that are imputed as well are tracked in {@code _isMVScaled}/
	 * {@code _mvscMethodList}; columns that are scaled but NOT imputed go into
	 * the separate {@code _scnomv*} arrays.
	 *
	 * @param parsedSpec transform specification (impute and/or scale entries)
	 * @param colnames   frame column names (unused here; kept for signature symmetry)
	 * @param NAstrings  strings to interpret as missing values
	 * @param clen       number of columns
	 * @throws JSONException if the spec is malformed
	 */
	public MVImputeAgent(JSONObject parsedSpec, String[] colnames, String[] NAstrings, int clen) throws JSONException {
		super(null, clen);
		boolean isMV = parsedSpec.containsKey(TfUtils.TXMETHOD_IMPUTE);
		boolean isSC = parsedSpec.containsKey(TfUtils.TXMETHOD_SCALE);
		_NAstrings = NAstrings;

		if (!isMV) {
			// MV Impute is not applicable
			_colList = null;
			_mvMethodList = null;
			_meanList = null;
			_countList = null;
			_replacementList = null;
		} else {
			JSONObject mvobj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_IMPUTE);
			JSONArray mvattrs = (JSONArray) mvobj.get(TfUtils.JSON_ATTRS);
			JSONArray mvmthds = (JSONArray) mvobj.get(TfUtils.JSON_MTHD);
			int mvLength = mvattrs.size();

			_colList = new int[mvLength];
			_mvMethodList = new MVMethod[mvLength];
			_meanList = new KahanObject[mvLength];
			_countList = new long[mvLength];
			_varList = new CM_COV_Object[mvLength];

			_isMVScaled = new BitSet(_colList.length);
			_isMVScaled.clear();

			for (int i = 0; i < _colList.length; i++) {
				_colList[i] = UtilFunctions.toInt(mvattrs.get(i));
				// method encoded as an integer index into MVMethod.values()
				_mvMethodList[i] = MVMethod.values()[UtilFunctions.toInt(mvmthds.get(i))];
				_meanList[i] = new KahanObject(0, 0);
			}

			_replacementList = new String[mvLength]; // contains replacements for all columns (scale and categorical)

			JSONArray constants = (JSONArray) mvobj.get(TfUtils.JSON_CONSTS);
			for (int i = 0; i < constants.size(); i++) {
				if (constants.get(i) == null)
					_replacementList[i] = "NaN";
				else
					_replacementList[i] = constants.get(i).toString();
			}
		}

		// Handle scaled attributes
		if (!isSC) {
			// scaling is not applicable
			_scnomvCountList = null;
			_scnomvMeanList = null;
			_scnomvVarList = null;
		} else {
			if (_colList != null)
				_mvscMethodList = new MVMethod[_colList.length];

			JSONObject scobj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_SCALE);
			JSONArray scattrs = (JSONArray) scobj.get(TfUtils.JSON_ATTRS);
			JSONArray scmthds = (JSONArray) scobj.get(TfUtils.JSON_MTHD);
			int scLength = scattrs.size();

			int[] _allscaled = new int[scLength];
			int scnomv = 0, colID;
			byte mthd;
			for (int i = 0; i < scLength; i++) {
				colID = UtilFunctions.toInt(scattrs.get(i));
				// scale methods reuse MVMethod ordinals (see _scnomvMethodList field
				// comment: 1 = mean-subtraction, 2 = z-scoring) — TODO confirm against spec writer
				mthd = (byte) UtilFunctions.toInt(scmthds.get(i));
				_allscaled[i] = colID;

				// check if the attribute is also MV imputed
				int mvidx = isApplicable(colID);
				if (mvidx != -1) {
					_isMVScaled.set(mvidx);
					_mvscMethodList[mvidx] = MVMethod.values()[mthd];
					_varList[mvidx] = new CM_COV_Object();
				} else
					scnomv++; // count of scaled but not imputed
			}

			if (scnomv > 0) {
				_scnomvList = new int[scnomv];
				_scnomvMethodList = new MVMethod[scnomv];
				_scnomvMeanList = new KahanObject[scnomv];
				_scnomvCountList = new long[scnomv];
				_scnomvVarList = new CM_COV_Object[scnomv];

				// second pass: collect the scaled-but-not-imputed columns
				for (int i = 0, idx = 0; i < scLength; i++) {
					colID = UtilFunctions.toInt(scattrs.get(i));
					mthd = (byte) UtilFunctions.toInt(scmthds.get(i));
					if (isApplicable(colID) == -1) { // scaled but not imputed
						_scnomvList[idx] = colID;
						_scnomvMethodList[idx] = MVMethod.values()[mthd];
						_scnomvMeanList[idx] = new KahanObject(0, 0);
						_scnomvVarList[idx] = new CM_COV_Object();
						idx++;
					}
				}
			}
		}
	}

	/**
	 * Parses per-column imputation methods and constant replacements from the
	 * new-style JSON spec, and allocates the accumulator arrays.
	 * (Method name "Replacments" is a historical typo, kept for compatibility.)
	 *
	 * @param parsedSpec transform specification containing the impute array
	 * @throws JSONException if the spec is malformed
	 */
	private void parseMethodsAndReplacments(JSONObject parsedSpec) throws JSONException {
		JSONArray mvspec = (JSONArray) parsedSpec.get(TfUtils.TXMETHOD_IMPUTE);
		_mvMethodList = new MVMethod[mvspec.size()];
		_replacementList = new String[mvspec.size()];
		_meanList = new KahanObject[mvspec.size()];
		_countList = new long[mvspec.size()];
		for (int i = 0; i < mvspec.size(); i++) {
			JSONObject mvobj = (JSONObject) mvspec.get(i);
			_mvMethodList[i] = MVMethod.valueOf(mvobj.get("method").toString().toUpperCase());
			if (_mvMethodList[i] == MVMethod.CONSTANT) {
				_replacementList[i] = mvobj.getString("value").toString();
			}
			_meanList[i] = new KahanObject(0, 0);
		}
	}

	/**
	 * Accumulates per-column statistics for one input row (string tokens).
	 * For imputed columns: counts non-missing values and, for global-mean or
	 * scaled columns, updates the running Kahan mean (and variance when the
	 * scale method requires it). For scaled-but-not-imputed columns: always
	 * updates mean (and variance when required).
	 *
	 * @param words one row, split into column tokens (1-based colIDs index words[colID-1])
	 * @throws IOException wrapping any processing error
	 */
	public void prepare(String[] words) throws IOException {
		try {
			String w = null;
			if (_colList != null)
				for (int i = 0; i < _colList.length; i++) {
					int colID = _colList[i];
					w = UtilFunctions.unquote(words[colID - 1].trim());
					try {
						if (!TfUtils.isNA(_NAstrings, w)) {
							_countList[i]++;
							boolean computeMean = (_mvMethodList[i] == MVMethod.GLOBAL_MEAN || _isMVScaled.get(i));
							if (computeMean) {
								// global_mean
								double d = UtilFunctions.parseToDouble(w);
								_meanFn.execute2(_meanList[i], d, _countList[i]);

								// variance only needed when the scale method requires it
								// (scale ordinal 2 maps onto MVMethod.GLOBAL_MODE — see ctor note)
								if (_isMVScaled.get(i) && _mvscMethodList[i] == MVMethod.GLOBAL_MODE)
									_varFn.execute(_varList[i], d);
							} else {
								// global_mode or constant
								// Nothing to do here. Mode is computed using recode maps.
							}
						}
					} catch (NumberFormatException e) {
						throw new RuntimeException("Encountered \"" + w + "\" in column ID \"" + colID + "\", when expecting a numeric value. Consider adding \"" + w + "\" to na.strings, along with an appropriate imputation method.");
					}
				}

			// Compute mean and variance for attributes that are scaled but not imputed
			if (_scnomvList != null)
				for (int i = 0; i < _scnomvList.length; i++) {
					int colID = _scnomvList[i];
					w = UtilFunctions.unquote(words[colID - 1].trim());
					double d = UtilFunctions.parseToDouble(w);
					_scnomvCountList[i]++; // not required, this is always equal to total #records processed
					_meanFn.execute2(_scnomvMeanList[i], d, _scnomvCountList[i]);
					if (_scnomvMethodList[i] == MVMethod.GLOBAL_MODE)
						_varFn.execute(_scnomvVarList[i], d);
				}
		} catch (Exception e) {
			throw new IOException(e);
		}
	}

	// ----------------------------------------------------------------------------------------------------------

	/** Serializes a CM_COV_Object as "w,mean._sum,mean._correction,m2._sum,m2._correction". */
	private String encodeCMObj(CM_COV_Object obj) {
		StringBuilder sb = new StringBuilder();
		sb.append(obj.w);
		sb.append(",");
		sb.append(obj.mean._sum);
		sb.append(",");
		sb.append(obj.mean._correction);
		sb.append(",");
		sb.append(obj.m2._sum);
		sb.append(",");
		sb.append(obj.m2._correction);
		return sb.toString();
	}

	/** Inverse of {@link #encodeCMObj(CM_COV_Object)}. */
	private CM_COV_Object decodeCMObj(String s) {
		CM_COV_Object obj = new CM_COV_Object();
		String[] parts = s.split(",");
		obj.w = UtilFunctions.parseToDouble(parts[0]);
		obj.mean._sum = UtilFunctions.parseToDouble(parts[1]);
		obj.mean._correction = UtilFunctions.parseToDouble(parts[2]);
		obj.m2._sum = UtilFunctions.parseToDouble(parts[3]);
		obj.m2._correction = UtilFunctions.parseToDouble(parts[4]);
		return obj;
	}

	/**
	 * Builds the mapper output token "mean_&lt;taskID&gt;_&lt;mean&gt;,&lt;suffix&gt;" where the
	 * suffix ("scmv"/"noscmv"/"scnomv") tells the reducer whether the column is
	 * scaled, imputed, or both. Returns null if no mean is tracked for this column.
	 */
	private DistinctValue prepMeanOutput(int taskID, int idx, StringBuilder sb, boolean scnomv) throws CharacterCodingException {
		MVMethod mthd = (scnomv ? _scnomvMethodList[idx] : _mvMethodList[idx]);

		if (scnomv || mthd == MVMethod.GLOBAL_MEAN || _isMVScaled.get(idx)) {
			String suffix = null;
			if (scnomv)
				suffix = "scnomv";
			else if (mthd == MVMethod.GLOBAL_MEAN && _isMVScaled.get(idx))
				suffix = "scmv"; // both scaled and mv imputed
			else if (mthd == MVMethod.GLOBAL_MEAN)
				suffix = "noscmv";
			else
				suffix = "scnomv";

			sb.setLength(0);
			sb.append(MEAN_PREFIX);
			sb.append("_");
			sb.append(taskID);
			sb.append("_");
			double mean = (scnomv ? _scnomvMeanList[idx]._sum : _meanList[idx]._sum);
			sb.append(Double.toString(mean));
			sb.append(",");
			sb.append(suffix);
			//String s = MEAN_PREFIX + "_" + taskID + "_" + Double.toString(_meanList[idx]._sum) + "," + suffix;
			return new DistinctValue(sb.toString(), -1L);
		}
		return null;
	}

	/** Builds the "correction_&lt;taskID&gt;_&lt;corr&gt;" token carrying the Kahan correction term. */
	private DistinctValue prepMeanCorrectionOutput(int taskID, int idx, StringBuilder sb, boolean scnomv) throws CharacterCodingException {
		MVMethod mthd = (scnomv ? _scnomvMethodList[idx] : _mvMethodList[idx]);
		if (scnomv || mthd == MVMethod.GLOBAL_MEAN || _isMVScaled.get(idx)) {
			sb.setLength(0);
			//CORRECTION_PREFIX + "_" + taskID + "_" + Double.toString(mean._correction);
			sb.append(CORRECTION_PREFIX);
			sb.append("_");
			sb.append(taskID);
			sb.append("_");
			double corr = (scnomv ? _scnomvMeanList[idx]._correction : _meanList[idx]._correction);
			sb.append(Double.toString(corr));
			return new DistinctValue(sb.toString(), -1L);
		}
		return null;
	}

	/** Builds the "validcount_&lt;taskID&gt;_&lt;count&gt;" token (#non-missing values seen by this task). */
	private DistinctValue prepMeanCountOutput(int taskID, int idx, StringBuilder sb, boolean scnomv) throws CharacterCodingException {
		MVMethod mthd = (scnomv ? _scnomvMethodList[idx] : _mvMethodList[idx]);
		if (scnomv || mthd == MVMethod.GLOBAL_MEAN || _isMVScaled.get(idx)) {
			sb.setLength(0);
			//s = COUNT_PREFIX + "_" + taskID + "_" + Long.toString(count);
			sb.append(COUNT_PREFIX);
			sb.append("_");
			sb.append(taskID);
			sb.append("_");
			long count = (scnomv ? _scnomvCountList[idx] : _countList[idx]);
			sb.append(Long.toString(count));
			return new DistinctValue(sb.toString(), -1L);
		}
		return null;
	}

	/**
	 * Builds the "totalcount_&lt;taskID&gt;_&lt;n&gt;" token; n is agents.getValid()
	 * (presumably the total #valid records processed by this task — confirm in TfUtils).
	 */
	private DistinctValue prepTotalCountOutput(int taskID, int idx, StringBuilder sb, boolean scnomv, TfUtils agents) throws CharacterCodingException {
		MVMethod mthd = (scnomv ? _scnomvMethodList[idx] : _mvMethodList[idx]);
		if (scnomv || mthd == MVMethod.GLOBAL_MEAN || _isMVScaled.get(idx)) {
			sb.setLength(0);
			//TOTAL_COUNT_PREFIX + "_" + taskID + "_" + Long.toString(TransformationAgent._numValidRecords);
			sb.append(TOTAL_COUNT_PREFIX);
			sb.append("_");
			sb.append(taskID);
			sb.append("_");
			sb.append(Long.toString(agents.getValid()));
			return new DistinctValue(sb.toString(), -1L);
		}
		return null;
	}

	/** Builds the "constant_&lt;replacement&gt;" token for columns imputed with a constant. */
	private DistinctValue prepConstantOutput(int idx, StringBuilder sb) throws CharacterCodingException {
		if (_mvMethodList == null)
			return null;
		MVMethod mthd = _mvMethodList[idx];
		if (mthd == MVMethod.CONSTANT) {
			sb.setLength(0);
			sb.append(CONSTANT_PREFIX);
			sb.append("_");
			sb.append(_replacementList[idx]);
			return new DistinctValue(sb.toString(), -1);
		}
		return null;
	}

	/** Builds the "var_&lt;taskID&gt;_&lt;encodedCM&gt;" token carrying this task's partial variance state. */
	private DistinctValue prepVarOutput(int taskID, int idx, StringBuilder sb, boolean scnomv) throws CharacterCodingException {
		if (scnomv || _isMVScaled.get(idx) && _mvscMethodList[idx] == MVMethod.GLOBAL_MODE) {
			sb.setLength(0);
			sb.append(VARIANCE_PREFIX);
			sb.append("_");
			sb.append(taskID);
			sb.append("_");
			CM_COV_Object cm = (scnomv ? _scnomvVarList[idx] : _varList[idx]);
			sb.append(encodeCMObj(cm));
			return new DistinctValue(sb.toString(), -1L);
		}
		return null;
	}

	/** Emits dv under key iw unless dv is null (prep* methods return null for N/A columns). */
	private void outDV(IntWritable iw, DistinctValue dv, OutputCollector<IntWritable, DistinctValue> out) throws IOException {
		if (dv != null)
			out.collect(iw, dv);
	}

	/**
	 * Method to output transformation metadata from the mappers.
	 * This information is collected and merged by the reducers.
	 * Keys are the negated column IDs; values are the prefixed partial-statistic tokens.
	 */
	@Override
	public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException {
		try {
			StringBuilder sb = new StringBuilder();
			DistinctValue dv = null;

			if (_colList != null)
				for (int i = 0; i < _colList.length; i++) {
					int colID = _colList[i];
					IntWritable iw = new IntWritable(-colID);

					dv = prepMeanOutput(taskID, i, sb, false);
					outDV(iw, dv, out);
					dv = prepMeanCorrectionOutput(taskID, i, sb, false);
					outDV(iw, dv, out);
					dv = prepMeanCountOutput(taskID, i, sb, false);
					outDV(iw, dv, out);
					dv = prepTotalCountOutput(taskID, i, sb, false, agents);
					outDV(iw, dv, out);
					dv = prepConstantOutput(i, sb);
					outDV(iw, dv, out);

					// output variance information relevant to scaling
					dv = prepVarOutput(taskID, i, sb, false);
					outDV(iw, dv, out);
				}

			// handle attributes that are scaled but not imputed
			if (_scnomvList != null)
				for (int i = 0; i < _scnomvList.length; i++) {
					int colID = _scnomvList[i];
					IntWritable iw = new IntWritable(-colID);

					dv = prepMeanOutput(taskID, i, sb, true);
					outDV(iw, dv, out);
					dv = prepMeanCorrectionOutput(taskID, i, sb, true);
					outDV(iw, dv, out);
					dv = prepMeanCountOutput(taskID, i, sb, true);
					outDV(iw, dv, out);
					dv = prepTotalCountOutput(taskID, i, sb, true, agents);
					outDV(iw, dv, out);
					dv = prepVarOutput(taskID, i, sb, true);
					outDV(iw, dv, out);
				}
		} catch (Exception e) {
			throw new IOException(e);
		}
	}

	/**
	 * Applicable when running on SPARK.
	 * Helper function to output transformation metadata into shuffle.
	 *
	 * @param iw integer value
	 * @param dv distinct value
	 * @param list list of integer-distinct value pairs
	 * @throws IOException if IOException occurs
	 */
	private void addDV(Integer iw, DistinctValue dv, ArrayList<Pair<Integer, DistinctValue>> list) throws IOException {
		if (dv != null)
			list.add(new Pair<Integer, DistinctValue>(iw, dv));
	}

	/**
	 * Spark counterpart of the MapReduce metadata output: collects the same
	 * prefixed partial-statistic tokens into a list instead of an OutputCollector.
	 *
	 * @param taskID task identifier embedded into the tokens
	 * @param list   output list (also returned for chaining)
	 * @param agents transform utilities providing per-task counters
	 * @return the populated list
	 * @throws IOException wrapping any processing error
	 */
	public ArrayList<Pair<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID, ArrayList<Pair<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
		try {
			StringBuilder sb = new StringBuilder();
			DistinctValue dv = null;

			if (_colList != null)
				for (int i = 0; i < _colList.length; i++) {
					int colID = _colList[i];
					Integer iw = -colID;

					dv = prepMeanOutput(taskID, i, sb, false);
					addDV(iw, dv, list);
					dv = prepMeanCorrectionOutput(taskID, i, sb, false);
					addDV(iw, dv, list);
					dv = prepMeanCountOutput(taskID, i, sb, false);
					addDV(iw, dv, list);
					dv = prepTotalCountOutput(taskID, i, sb, false, agents);
					addDV(iw, dv, list);
					dv = prepConstantOutput(i, sb);
					addDV(iw, dv, list);

					// output variance information relevant to scaling
					dv = prepVarOutput(taskID, i, sb, false);
					addDV(iw, dv, list);
				}

			// handle attributes that are scaled but not imputed
			if (_scnomvList != null)
				for (int i = 0; i < _scnomvList.length; i++) {
					int colID = _scnomvList[i];
					Integer iw = -colID;

					dv = prepMeanOutput(taskID, i, sb, true);
					addDV(iw, dv, list);
					dv = prepMeanCorrectionOutput(taskID, i, sb, true);
					addDV(iw, dv, list);
					dv = prepMeanCountOutput(taskID, i, sb, true);
					addDV(iw, dv, list);
					dv = prepTotalCountOutput(taskID, i, sb, true, agents);
					addDV(iw, dv, list);
					dv = prepVarOutput(taskID, i, sb, true);
					addDV(iw, dv, list);
				}
		} catch (Exception e) {
			throw new IOException(e);
		}

		return list;
	}

	// ----------------------------------------------------------------------------------------------------------

	/** Writes "colID<SEP>mean" to the Impute metadata file for this column (overwrites). */
	private void writeTfMtd(int colID, String mean, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException {
		Path pt = new Path(tfMtdDir + "/Impute/" + agents.getName(colID) + TfUtils.TXMTD_MV_FILE_SUFFIX);
		try (BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)))) {
			br.write(colID + TfUtils.TXMTD_SEP + mean + "\n");
		}
	}

	/** Writes "colID<SEP>mean<SEP>sdev" to the Scale metadata file for this column (overwrites). */
	private void writeTfMtd(int colID, String mean, String sdev, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException {
		Path pt = new Path(tfMtdDir + "/Scale/" + agents.getName(colID) + TfUtils.SCALE_FILE_SUFFIX);
		try (BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)))) {
			br.write(colID + TfUtils.TXMTD_SEP + mean + TfUtils.TXMTD_SEP + sdev + "\n");
		}
	}

	/** Writes "colID<SEP>min<SEP>max<SEP>binwidth<SEP>nbins" to the Bin metadata file (overwrites). */
	private void writeTfMtd(int colID, String min, String max, String binwidth, String nbins, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException {
		Path pt = new Path(tfMtdDir + "/Bin/" + agents.getName(colID) + TfUtils.TXMTD_BIN_FILE_SUFFIX);
		try (BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)))) {
			br.write(colID + TfUtils.TXMTD_SEP + min + TfUtils.TXMTD_SEP + max + TfUtils.TXMTD_SEP + binwidth + TfUtils.TXMTD_SEP + nbins + "\n");
		}
	}

	/**
	 * Single-node (non-merged) metadata output: writes per-column impute and
	 * scale metadata from the locally accumulated statistics. For constant-imputed,
	 * scaled columns the mean/variance are adjusted to account for the values
	 * that will be imputed (weight = #missing values).
	 *
	 * @param outputDir metadata output directory
	 * @param fs        filesystem handle
	 * @param agents    transform utilities (valid-record count, file naming)
	 * @throws IOException on write or computation errors
	 */
	public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
		try {
			if (_colList != null)
				for (int i = 0; i < _colList.length; i++) {
					int colID = _colList[i];

					double imputedValue = Double.NaN;
					KahanObject gmean = null;
					if (_mvMethodList[i] == MVMethod.GLOBAL_MEAN) {
						gmean = _meanList[i];
						imputedValue = _meanList[i]._sum;

						double mean = (_countList[i] == 0 ? 0.0 : _meanList[i]._sum);
						writeTfMtd(colID, Double.toString(mean), outputDir, fs, agents);
					} else if (_mvMethodList[i] == MVMethod.CONSTANT) {
						writeTfMtd(colID, _replacementList[i], outputDir, fs, agents);

						if (_isMVScaled.get(i)) {
							imputedValue = UtilFunctions.parseToDouble(_replacementList[i]);
							// adjust the global mean, by combining gmean with "replacement" (weight = #missing values)
							gmean = new KahanObject(_meanList[i]._sum, _meanList[i]._correction);
							_meanFn.execute(gmean, imputedValue, agents.getValid());
						}
					}

					if (_isMVScaled.get(i)) {
						double sdev = -1.0;
						if (_mvscMethodList[i] == MVMethod.GLOBAL_MODE) {
							// Adjust variance with missing values
							long totalMissingCount = (agents.getValid() - _countList[i]);
							_varFn.execute(_varList[i], imputedValue, totalMissingCount);
							double var = _varList[i].getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
							sdev = Math.sqrt(var);
						}
						writeTfMtd(colID, Double.toString(gmean._sum), Double.toString(sdev), outputDir, fs, agents);
					}
				}

			if (_scnomvList != null)
				for (int i = 0; i < _scnomvList.length; i++) {
					int colID = _scnomvList[i];
					double mean = (_scnomvCountList[i] == 0 ? 0.0 : _scnomvMeanList[i]._sum);
					double sdev = -1.0;
					if (_scnomvMethodList[i] == MVMethod.GLOBAL_MODE) {
						double var = _scnomvVarList[i].getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
						sdev = Math.sqrt(var);
					}
					writeTfMtd(colID, Double.toString(mean), Double.toString(sdev), outputDir, fs, agents);
				}
		} catch (DMLRuntimeException e) {
			throw new IOException(e);
		}
	}

	/**
	 * Method to merge map output transformation metadata.
	 * Reducer-side merge: parses the prefixed tokens emitted per mapper task,
	 * combines partial means (incremental pooled-mean update with Kahan
	 * compensation), combines partial variances via CM-object merging, adjusts
	 * for missing values, and writes the final Impute/Scale/Bin metadata files.
	 */
	@Override
	public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
		double min = Double.MAX_VALUE;
		double max = -Double.MAX_VALUE;
		int nbins = 0;
		double d;
		long totalRecordCount = 0, totalValidCount = 0;
		String mvConstReplacement = null;

		DistinctValue val = new DistinctValue();
		String w = null;

		// per-task partial mean state (mean, Kahan correction, valid count)
		class MeanObject {
			double mean, correction;
			long count;

			MeanObject() {
			}

			public String toString() {
				return mean + "," + correction + "," + count;
			}
		};

		HashMap<Integer, MeanObject> mapMeans = new HashMap<Integer, MeanObject>();
		HashMap<Integer, CM_COV_Object> mapVars = new HashMap<Integer, CM_COV_Object>();
		boolean isImputed = false;
		boolean isScaled = false;
		boolean isBinned = false;
		// dispatch each token on its prefix and aggregate per-task partials
		while (values.hasNext()) {
			val.reset();
			val = values.next();
			w = val.getWord();

			if (w.startsWith(MEAN_PREFIX)) {
				String[] parts = w.split("_");
				int taskID = UtilFunctions.parseToInt(parts[1]);
				MeanObject mo = mapMeans.get(taskID);
				if (mo == null)
					mo = new MeanObject();

				mo.mean = UtilFunctions.parseToDouble(parts[2].split(",")[0]);

				// check if this attribute is scaled
				String s = parts[2].split(",")[1];
				if (s.equalsIgnoreCase("scmv"))
					isScaled = isImputed = true;
				else if (s.equalsIgnoreCase("scnomv"))
					isScaled = true;
				else
					isImputed = true;
				mapMeans.put(taskID, mo);
			} else if (w.startsWith(CORRECTION_PREFIX)) {
				String[] parts = w.split("_");
				int taskID = UtilFunctions.parseToInt(parts[1]);
				MeanObject mo = mapMeans.get(taskID);
				if (mo == null)
					mo = new MeanObject();
				mo.correction = UtilFunctions.parseToDouble(parts[2]);
				mapMeans.put(taskID, mo);
			} else if (w.startsWith(CONSTANT_PREFIX)) {
				isImputed = true;
				String[] parts = w.split("_");
				mvConstReplacement = parts[1];
			} else if (w.startsWith(COUNT_PREFIX)) {
				String[] parts = w.split("_");
				int taskID = UtilFunctions.parseToInt(parts[1]);
				MeanObject mo = mapMeans.get(taskID);
				if (mo == null)
					mo = new MeanObject();
				mo.count = UtilFunctions.parseToLong(parts[2]);
				totalValidCount += mo.count;
				mapMeans.put(taskID, mo);
			} else if (w.startsWith(TOTAL_COUNT_PREFIX)) {
				String[] parts = w.split("_");
				//int taskID = UtilFunctions.parseToInt(parts[1]);
				totalRecordCount += UtilFunctions.parseToLong(parts[2]);
			} else if (w.startsWith(VARIANCE_PREFIX)) {
				isScaled = true;
				String[] parts = w.split("_");
				int taskID = UtilFunctions.parseToInt(parts[1]);
				CM_COV_Object cm = decodeCMObj(parts[2]);
				mapVars.put(taskID, cm);
			} else if (w.startsWith(BinAgent.MIN_PREFIX)) {
				isBinned = true;
				d = UtilFunctions.parseToDouble(w.substring(BinAgent.MIN_PREFIX.length()));
				if (d < min)
					min = d;
			} else if (w.startsWith(BinAgent.MAX_PREFIX)) {
				isBinned = true;
				d = UtilFunctions.parseToDouble(w.substring(BinAgent.MAX_PREFIX.length()));
				if (d > max)
					max = d;
			} else if (w.startsWith(BinAgent.NBINS_PREFIX)) {
				isBinned = true;
				nbins = (int) UtilFunctions.parseToLong(w.substring(BinAgent.NBINS_PREFIX.length()));
			} else
				throw new RuntimeException("MVImputeAgent: Invalid prefix while merging map output: " + w);
		}

		// compute global mean across all map outputs
		// (incremental pooled mean: gmean += (mo.mean - gmean) * mo.count / gcount)
		KahanObject gmean = new KahanObject(0, 0);
		KahanPlus kp = KahanPlus.getKahanPlusFnObject();
		long gcount = 0;
		for (MeanObject mo : mapMeans.values()) {
			gcount = gcount + mo.count;
			if (gcount > 0) {
				double delta = mo.mean - gmean._sum;
				kp.execute2(gmean, delta * mo.count / gcount);
				//_meanFn.execute2(gmean, mo.mean*mo.count, gcount);
			}
		}

		// compute global variance across all map outputs
		CM_COV_Object gcm = new CM_COV_Object();
		try {
			for (CM_COV_Object cm : mapVars.values())
				gcm = (CM_COV_Object) _varFn.execute(gcm, cm);
		} catch (DMLRuntimeException e) {
			throw new IOException(e);
		}

		// If the column is imputed with a constant, then adjust min and max based the value of the constant.
		if (isImputed && isBinned && mvConstReplacement != null) {
			double cst = UtilFunctions.parseToDouble(mvConstReplacement);
			if (cst < min)
				min = cst;
			if (cst > max)
				max = cst;
		}

		// write merged metadata
		if (isImputed) {
			String imputedValue = null;
			if (mvConstReplacement != null)
				imputedValue = mvConstReplacement;
			else
				imputedValue = Double.toString(gcount == 0 ? 0.0 : gmean._sum);
			writeTfMtd(colID, imputedValue, outputDir, fs, agents);
		}

		if (isBinned) {
			double binwidth = (max - min) / nbins;
			writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents);
		}

		if (isScaled) {
			try {
				if (totalValidCount != totalRecordCount) {
					// In the presence of missing values, the variance needs to be adjusted.
					// The mean does not need to be adjusted, when mv impute method is global_mean,
					// since missing values themselves are replaced with gmean.
					long totalMissingCount = (totalRecordCount - totalValidCount);
					int idx = isApplicable(colID);
					if (idx != -1 && _mvMethodList[idx] == MVMethod.CONSTANT)
						_meanFn.execute(gmean, UtilFunctions.parseToDouble(_replacementList[idx]), totalRecordCount);
					_varFn.execute(gcm, gmean._sum, totalMissingCount);
				}

				double mean = (gcount == 0 ? 0.0 : gmean._sum);
				double var = gcm.getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
				double sdev = (mapVars.size() > 0 ? Math.sqrt(var) : -1.0);

				writeTfMtd(colID, Double.toString(mean), Double.toString(sdev), outputDir, fs, agents);
			} catch (DMLRuntimeException e) {
				throw new IOException(e);
			}
		}
	}

	// ------------------------------------------------------------------------------------------------

	/** Reads the replacement value for colID from its Impute metadata file ("colID<SEP>value"). */
	private String readReplacement(int colID, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
		Path path = new Path(txMtdDir + "/Impute/" + agents.getName(colID) + TfUtils.TXMTD_MV_FILE_SUFFIX);
		TfUtils.checkValidInputFile(fs, path, true);

		String replacement = null;
		try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)))) {
			String line = br.readLine();
			replacement = UtilFunctions.unquote(line.split(TfUtils.TXMTD_SEP)[1]);
		}
		return replacement;
	}

	/** Reads the raw scale-metadata line ("colID,mean,sdev") for colID. */
	public String readScaleLine(int colID, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
		Path path = new Path(txMtdDir + "/Scale/" + agents.getName(colID) + TfUtils.SCALE_FILE_SUFFIX);
		TfUtils.checkValidInputFile(fs, path, true);
		String line = null;
		try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)))) {
			line = br.readLine();
		}
		return line;
	}

	/**
	 * Loads one column's scale metadata into the given accumulators.
	 * NOTE(review): the sdev is stored into varList[i].mean._sum — apply() later
	 * divides by exactly that field, so this repurposing appears intentional.
	 */
	private void processScalingFile(int i, int[] list, KahanObject[] meanList, CM_COV_Object[] varList, FileSystem fs, Path tfMtdDir, TfUtils agents) throws IOException {
		int colID = list[i];

		String line = readScaleLine(colID, fs, tfMtdDir, agents);
		String[] parts = line.split(",");
		double mean = UtilFunctions.parseToDouble(parts[1]);
		double sd = UtilFunctions.parseToDouble(parts[2]);

		meanList[i]._sum = mean;
		varList[i].mean._sum = sd;
	}

	// ------------------------------------------------------------------------------------------------

	/**
	 * Method to load transform metadata for all attributes:
	 * replacement values for imputed columns and mean/sdev for scaled columns.
	 */
	@Override
	public void loadTxMtd(JobConf job, FileSystem fs, Path tfMtdDir, TfUtils agents) throws IOException {
		if (fs.isDirectory(tfMtdDir)) {
			// Load information about missing value imputation
			if (_colList != null)
				for (int i = 0; i < _colList.length; i++) {
					int colID = _colList[i];

					if (_mvMethodList[i] == MVMethod.GLOBAL_MEAN || _mvMethodList[i] == MVMethod.GLOBAL_MODE)
						// global_mean or global_mode
						_replacementList[i] = readReplacement(colID, fs, tfMtdDir, agents);
					else if (_mvMethodList[i] == MVMethod.CONSTANT) {
						// constant: replace a missing value by a given constant
						// nothing to do. The constant values are loaded already during configure
					} else
						throw new RuntimeException("Invalid Missing Value Imputation methods: " + _mvMethodList[i]);
				}

			// Load scaling information
			if (_colList != null)
				for (int i = 0; i < _colList.length; i++)
					if (_isMVScaled.get(i))
						processScalingFile(i, _colList, _meanList, _varList, fs, tfMtdDir, agents);

			if (_scnomvList != null)
				for (int i = 0; i < _scnomvList.length; i++)
					processScalingFile(i, _scnomvList, _scnomvMeanList, _scnomvVarList, fs, tfMtdDir, agents);
		} else {
			throw new RuntimeException("Path to recode maps must be a directory: " + tfMtdDir);
		}
	}

	/** Returns the imputation method for colID, or INVALID if the column is not imputed. */
	public MVMethod getMethod(int colID) {
		int idx = isApplicable(colID);
		if (idx == -1)
			return MVMethod.INVALID;
		else
			return _mvMethodList[idx];
	}

	/** Returns the #non-missing values seen for colID, or 0 if the column is not imputed. */
	public long getNonMVCount(int colID) {
		int idx = isApplicable(colID);
		return (idx == -1) ? 0 : _countList[idx];
	}

	/** Returns the replacement value for colID, or null if the column is not imputed. */
	public String getReplacement(int colID) {
		int idx = isApplicable(colID);
		return (idx == -1) ? null : _replacementList[idx];
	}

	/** Builds the imputation statistics over the input frame, then applies them. */
	@Override
	public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
		build(in);
		return apply(in, out);
	}

	/**
	 * Accumulates per-column replacement values over the input frame:
	 * running Kahan mean for GLOBAL_MEAN columns, and a category histogram
	 * (kept in _hist across calls) whose argmax is the mode for GLOBAL_MODE columns.
	 */
	@Override
	public void build(FrameBlock in) {
		try {
			for (int j = 0; j < _colList.length; j++) {
				int colID = _colList[j];
				if (_mvMethodList[j] == MVMethod.GLOBAL_MEAN) {
					//compute global column mean (scale)
					long off = _countList[j];
					for (int i = 0; i < in.getNumRows(); i++)
						_meanFn.execute2(_meanList[j], UtilFunctions.objectToDouble(in.getSchema()[colID - 1], in.get(i, colID - 1)), off + i + 1);
					_replacementList[j] = String.valueOf(_meanList[j]._sum);
					_countList[j] += in.getNumRows();
				} else if (_mvMethodList[j] == MVMethod.GLOBAL_MODE) {
					//compute global column mode (categorical), i.e., most frequent category
					HashMap<String, Long> hist = _hist.containsKey(colID) ? _hist.get(colID) : new HashMap<String, Long>();
					for (int i = 0; i < in.getNumRows(); i++) {
						String key = String.valueOf(in.get(i, colID - 1));
						if (key != null && !key.isEmpty()) {
							Long val = hist.get(key);
							hist.put(key, (val != null) ? val + 1 : 1);
						}
					}
					_hist.put(colID, hist);
					long max = Long.MIN_VALUE;
					for (Entry<String, Long> e : hist.entrySet())
						if (e.getValue() > max) {
							_replacementList[j] = e.getKey();
							max = e.getValue();
						}
				}
			}
		} catch (Exception ex) {
			throw new RuntimeException(ex);
		}
	}

	/**
	 * Applies imputation (and scaling, when configured) to one row of string
	 * tokens in place: missing entries are replaced, then scaled columns are
	 * mean-subtracted or z-scored (sdev read from varList.mean._sum, see
	 * processScalingFile).
	 */
	@Override
	public String[] apply(String[] words) {
		if (isApplicable())
			for (int i = 0; i < _colList.length; i++) {
				int colID = _colList[i];
				String w = UtilFunctions.unquote(words[colID - 1]);
				if (TfUtils.isNA(_NAstrings, w))
					w = words[colID - 1] = _replacementList[i];

				if (_isMVScaled.get(i))
					if (_mvscMethodList[i] == MVMethod.GLOBAL_MEAN)
						words[colID - 1] = Double.toString(UtilFunctions.parseToDouble(w) - _meanList[i]._sum);
					else
						words[colID - 1] = Double.toString((UtilFunctions.parseToDouble(w) - _meanList[i]._sum) / _varList[i].mean._sum);
			}

		if (_scnomvList != null)
			for (int i = 0; i < _scnomvList.length; i++) {
				int colID = _scnomvList[i];
				if (_scnomvMethodList[i] == MVMethod.GLOBAL_MEAN)
					words[colID - 1] = Double.toString(UtilFunctions.parseToDouble(words[colID - 1]) - _scnomvMeanList[i]._sum);
				else
					words[colID - 1] = Double.toString((UtilFunctions.parseToDouble(words[colID - 1]) - _scnomvMeanList[i]._sum) / _scnomvVarList[i].mean._sum);
			}

		return words;
	}

	/** Replaces NaN cells of imputed columns in the output matrix with the column's replacement value. */
	@Override
	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
		for (int i = 0; i < in.getNumRows(); i++) {
			for (int j = 0; j < _colList.length; j++) {
				int colID = _colList[j];
				if (Double.isNaN(out.quickGetValue(i, colID - 1)))
					out.quickSetValue(i, colID - 1, Double.parseDouble(_replacementList[j]));
			}
		}
		return out;
	}

	/** Stores the per-column replacement values into the frame's column metadata. */
	@Override
	public FrameBlock getMetaData(FrameBlock out) {
		for (int j = 0; j < _colList.length; j++) {
			out.getColumnMetadata(_colList[j] - 1).setMvValue(_replacementList[j]);
		}
		return out;
	}

	/**
	 * Initializes the replacement list from frame metadata; for recoded columns
	 * the replacement is mapped through the recode map so imputation can be
	 * applied after recoding.
	 *
	 * @param meta frame holding column metadata and recode maps
	 */
	public void initMetaData(FrameBlock meta) {
		//init replacement lists, replace recoded values to
		//apply mv imputation potentially after recoding
		for (int j = 0; j < _colList.length; j++) {
			int colID = _colList[j];
			String mvVal = UtilFunctions.unquote(meta.getColumnMetadata(colID - 1).getMvValue());
			if (_rcList.contains(colID)) {
				Long mvVal2 = meta.getRecodeMap(colID - 1).get(mvVal);
				if (mvVal2 == null)
					throw new RuntimeException("Missing recode value for impute value '" + mvVal + "' (colID=" + colID + ").");
				_replacementList[j] = mvVal2.toString();
			} else {
				_replacementList[j] = mvVal;
			}
		}
	}

	/** Registers the list of recoded column IDs (consulted by initMetaData). */
	public void initRecodeIDList(List<Integer> rcList) {
		_rcList = rcList;
	}

	/**
	 * Exposes the internal histogram after build.
	 *
	 * @param colID column ID
	 * @return histogram (map of string keys and long values)
	 */
	public HashMap<String, Long> getHistogram(int colID) {
		return _hist.get(colID);
	}
}