// Java tutorial  (stray non-code header; commented out so the file compiles)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.transform;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.functionobjects.CM;
import org.apache.sysml.runtime.functionobjects.KahanPlus;
import org.apache.sysml.runtime.functionobjects.Mean;
import org.apache.sysml.runtime.instructions.cp.CM_COV_Object;
import org.apache.sysml.runtime.instructions.cp.KahanObject;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.matrix.operators.CMOperator;
import org.apache.sysml.runtime.matrix.operators.CMOperator.AggregateOperationTypes;
import org.apache.sysml.runtime.transform.encode.Encoder;
import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
import org.apache.sysml.runtime.util.UtilFunctions;

/**
 * Encoder for missing-value (MV) imputation and, optionally, scaling of numeric
 * attributes. Imputation methods are per-column: global mean, global mode, or a
 * user-supplied constant. For columns that are scaled (mean-subtraction or
 * z-scoring), this agent also accumulates column means and variances using
 * Kahan-compensated summation ({@link KahanObject}) and central-moment objects
 * ({@link CM_COV_Object}).
 *
 * The class supports two execution paths visible in this file:
 * (1) a Hadoop MapReduce path where per-mapper partial statistics are emitted as
 * prefixed string tokens (see {@code MEAN_PREFIX} etc.) via
 * {@code mapOutputTransformationMetadata} and merged in
 * {@code mergeAndOutputTransformationMetadata}; and
 * (2) a CP/Spark path via {@code build}/{@code apply} on {@link FrameBlock}s.
 */
public class MVImputeAgent extends Encoder {
	private static final long serialVersionUID = 9057868620144662194L;

	// Prefixes used to tag the partial statistics emitted by mappers; the reducer
	// dispatches on these prefixes when merging (see mergeAndOutputTransformationMetadata).
	public static final String MEAN_PREFIX = "mean";
	public static final String VARIANCE_PREFIX = "var";
	public static final String CORRECTION_PREFIX = "correction";
	public static final String COUNT_PREFIX = "validcount"; // #of valid or non-missing values in a column
	public static final String TOTAL_COUNT_PREFIX = "totalcount"; // #of total records processed by a mapper
	public static final String CONSTANT_PREFIX = "constant";

	// NOTE: ordinal positions matter — the JSON spec encodes methods as integers
	// that are mapped through MVMethod.values()[...] in the constructors.
	public enum MVMethod { INVALID, GLOBAL_MEAN, GLOBAL_MODE, CONSTANT };

	private MVMethod[] _mvMethodList = null; // per imputed column: imputation method
	private MVMethod[] _mvscMethodList = null; // scaling methods for attributes that are imputed and also scaled
	private BitSet _isMVScaled = null; // bit i set iff imputed column i is also scaled

	private CM _varFn = CM.getCMFnObject(AggregateOperationTypes.VARIANCE); // function object that understands variance computation

	// objects required to compute mean and variance of all non-missing entries
	private Mean _meanFn = Mean.getMeanFnObject(); // function object that understands mean computation
	private KahanObject[] _meanList = null; // column-level means, computed so far
	private long[] _countList = null; // #of non-missing values
	private CM_COV_Object[] _varList = null; // column-level variances, computed so far (for scaling)

	private int[] _scnomvList = null; // List of attributes that are scaled but not imputed
	private MVMethod[] _scnomvMethodList = null; // scaling methods: 0 for invalid; 1 for mean-subtraction; 2 for z-scoring
	private KahanObject[] _scnomvMeanList = null; // column-level means, for attributes scaled but not imputed
	private long[] _scnomvCountList = null; // #of non-missing values, for attributes scaled but not imputed
	private CM_COV_Object[] _scnomvVarList = null; // column-level variances, computed so far

	private String[] _replacementList = null; // replacements: for global_mean, mean; and for global_mode, recode id of mode category
	private String[] _NAstrings = null; // strings to be treated as missing values
	private List<Integer> _rcList = null; // recoded column IDs (set via initRecodeIDList)
	private HashMap<Integer, HashMap<String, Long>> _hist = null; // per-column category histograms for mode computation

	public String[] getReplacements() { return _replacementList; }
	public KahanObject[] getMeans() { return _meanList; }
	public CM_COV_Object[] getVars() { return _varList; }
	public KahanObject[] getMeans_scnomv() { return _scnomvMeanList; }
	public CM_COV_Object[] getVars_scnomv() { return _scnomvVarList; }

	/**
	 * Constructs the agent from the new-style JSON spec (per-column objects with
	 * "method"/"value" entries), used on the frame-based encode path.
	 *
	 * @param parsedSpec transform specification (must contain an impute entry)
	 * @param colnames   frame column names used to resolve column IDs
	 * @param clen       number of columns
	 * @throws JSONException if the spec is malformed
	 */
	public MVImputeAgent(JSONObject parsedSpec, String[] colnames, int clen) throws JSONException {
		super(null, clen);
		//handle column list
		int[] collist = TfMetaUtils.parseJsonObjectIDList(parsedSpec, colnames, TfUtils.TXMETHOD_IMPUTE);
		initColList(collist);
		//handle method list
		parseMethodsAndReplacments(parsedSpec);
		//create reuse histograms
		_hist = new HashMap<Integer, HashMap<String, Long>>();
	}

	/**
	 * Constructs the agent from the old-style JSON spec (parallel "attributes"/
	 * "methods" arrays), used on the MR path. Also wires up scaling: columns in
	 * the scale spec that are imputed as well are tracked in {@code _isMVScaled}/
	 * {@code _mvscMethodList}; columns that are scaled but NOT imputed go into
	 * the separate {@code _scnomv*} arrays.
	 *
	 * @param parsedSpec transform specification (impute and/or scale entries)
	 * @param colnames   frame column names (unused here; kept for signature symmetry)
	 * @param NAstrings  strings to interpret as missing values
	 * @param clen       number of columns
	 * @throws JSONException if the spec is malformed
	 */
	public MVImputeAgent(JSONObject parsedSpec, String[] colnames, String[] NAstrings, int clen) throws JSONException {
		super(null, clen);
		boolean isMV = parsedSpec.containsKey(TfUtils.TXMETHOD_IMPUTE);
		boolean isSC = parsedSpec.containsKey(TfUtils.TXMETHOD_SCALE);
		_NAstrings = NAstrings;

		if (!isMV) {
			// MV Impute is not applicable
			_colList = null;
			_mvMethodList = null;
			_meanList = null;
			_countList = null;
			_replacementList = null;
		} else {
			JSONObject mvobj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_IMPUTE);
			JSONArray mvattrs = (JSONArray) mvobj.get(TfUtils.JSON_ATTRS);
			JSONArray mvmthds = (JSONArray) mvobj.get(TfUtils.JSON_MTHD);
			int mvLength = mvattrs.size();

			_colList = new int[mvLength];
			_mvMethodList = new MVMethod[mvLength];
			_meanList = new KahanObject[mvLength];
			_countList = new long[mvLength];
			_varList = new CM_COV_Object[mvLength];

			_isMVScaled = new BitSet(_colList.length);
			_isMVScaled.clear();

			for (int i = 0; i < _colList.length; i++) {
				_colList[i] = UtilFunctions.toInt(mvattrs.get(i));
				// method encoded as an integer index into MVMethod.values()
				_mvMethodList[i] = MVMethod.values()[UtilFunctions.toInt(mvmthds.get(i))];
				_meanList[i] = new KahanObject(0, 0);
			}

			_replacementList = new String[mvLength]; // contains replacements for all columns (scale and categorical)

			JSONArray constants = (JSONArray) mvobj.get(TfUtils.JSON_CONSTS);
			for (int i = 0; i < constants.size(); i++) {
				if (constants.get(i) == null)
					_replacementList[i] = "NaN";
				else
					_replacementList[i] = constants.get(i).toString();
			}
		}

		// Handle scaled attributes
		if (!isSC) {
			// scaling is not applicable
			_scnomvCountList = null;
			_scnomvMeanList = null;
			_scnomvVarList = null;
		} else {
			if (_colList != null)
				_mvscMethodList = new MVMethod[_colList.length];

			JSONObject scobj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_SCALE);
			JSONArray scattrs = (JSONArray) scobj.get(TfUtils.JSON_ATTRS);
			JSONArray scmthds = (JSONArray) scobj.get(TfUtils.JSON_MTHD);
			int scLength = scattrs.size();

			int[] _allscaled = new int[scLength];
			int scnomv = 0, colID;
			byte mthd;
			for (int i = 0; i < scLength; i++) {
				colID = UtilFunctions.toInt(scattrs.get(i));
				// scale methods reuse MVMethod ordinals (see _scnomvMethodList field
				// comment: 1 = mean-subtraction, 2 = z-scoring) — TODO confirm against spec writer
				mthd = (byte) UtilFunctions.toInt(scmthds.get(i));
				_allscaled[i] = colID;

				// check if the attribute is also MV imputed
				int mvidx = isApplicable(colID);
				if (mvidx != -1) {
					_isMVScaled.set(mvidx);
					_mvscMethodList[mvidx] = MVMethod.values()[mthd];
					_varList[mvidx] = new CM_COV_Object();
				} else
					scnomv++; // count of scaled but not imputed
			}

			if (scnomv > 0) {
				_scnomvList = new int[scnomv];
				_scnomvMethodList = new MVMethod[scnomv];
				_scnomvMeanList = new KahanObject[scnomv];
				_scnomvCountList = new long[scnomv];
				_scnomvVarList = new CM_COV_Object[scnomv];

				// second pass: collect the scaled-but-not-imputed columns
				for (int i = 0, idx = 0; i < scLength; i++) {
					colID = UtilFunctions.toInt(scattrs.get(i));
					mthd = (byte) UtilFunctions.toInt(scmthds.get(i));
					if (isApplicable(colID) == -1) { // scaled but not imputed
						_scnomvList[idx] = colID;
						_scnomvMethodList[idx] = MVMethod.values()[mthd];
						_scnomvMeanList[idx] = new KahanObject(0, 0);
						_scnomvVarList[idx] = new CM_COV_Object();
						idx++;
					}
				}
			}
		}
	}

	/**
	 * Parses per-column imputation methods and constant replacements from the
	 * new-style JSON spec, and allocates the accumulator arrays.
	 * (Method name "Replacments" is a historical typo, kept for compatibility.)
	 *
	 * @param parsedSpec transform specification containing the impute array
	 * @throws JSONException if the spec is malformed
	 */
	private void parseMethodsAndReplacments(JSONObject parsedSpec) throws JSONException {
		JSONArray mvspec = (JSONArray) parsedSpec.get(TfUtils.TXMETHOD_IMPUTE);
		_mvMethodList = new MVMethod[mvspec.size()];
		_replacementList = new String[mvspec.size()];
		_meanList = new KahanObject[mvspec.size()];
		_countList = new long[mvspec.size()];
		for (int i = 0; i < mvspec.size(); i++) {
			JSONObject mvobj = (JSONObject) mvspec.get(i);
			_mvMethodList[i] = MVMethod.valueOf(mvobj.get("method").toString().toUpperCase());
			if (_mvMethodList[i] == MVMethod.CONSTANT) {
				_replacementList[i] = mvobj.getString("value").toString();
			}
			_meanList[i] = new KahanObject(0, 0);
		}
	}

	/**
	 * Accumulates per-column statistics for one input row (string tokens).
	 * For imputed columns: counts non-missing values and, for global-mean or
	 * scaled columns, updates the running Kahan mean (and variance when the
	 * scale method requires it). For scaled-but-not-imputed columns: always
	 * updates mean (and variance when required).
	 *
	 * @param words one row, split into column tokens (1-based colIDs index words[colID-1])
	 * @throws IOException wrapping any processing error
	 */
	public void prepare(String[] words) throws IOException {
		try {
			String w = null;
			if (_colList != null)
				for (int i = 0; i < _colList.length; i++) {
					int colID = _colList[i];
					w = UtilFunctions.unquote(words[colID - 1].trim());
					try {
						if (!TfUtils.isNA(_NAstrings, w)) {
							_countList[i]++;
							boolean computeMean = (_mvMethodList[i] == MVMethod.GLOBAL_MEAN || _isMVScaled.get(i));
							if (computeMean) {
								// global_mean
								double d = UtilFunctions.parseToDouble(w);
								_meanFn.execute2(_meanList[i], d, _countList[i]);

								// variance only needed when the scale method requires it
								// (scale ordinal 2 maps onto MVMethod.GLOBAL_MODE — see ctor note)
								if (_isMVScaled.get(i) && _mvscMethodList[i] == MVMethod.GLOBAL_MODE)
									_varFn.execute(_varList[i], d);
							} else {
								// global_mode or constant
								// Nothing to do here. Mode is computed using recode maps.
							}
						}
					} catch (NumberFormatException e) {
						throw new RuntimeException("Encountered \"" + w + "\" in column ID \"" + colID + "\", when expecting a numeric value. Consider adding \"" + w + "\" to na.strings, along with an appropriate imputation method.");
					}
				}

			// Compute mean and variance for attributes that are scaled but not imputed
			if (_scnomvList != null)
				for (int i = 0; i < _scnomvList.length; i++) {
					int colID = _scnomvList[i];
					w = UtilFunctions.unquote(words[colID - 1].trim());
					double d = UtilFunctions.parseToDouble(w);
					_scnomvCountList[i]++; // not required, this is always equal to total #records processed
					_meanFn.execute2(_scnomvMeanList[i], d, _scnomvCountList[i]);
					if (_scnomvMethodList[i] == MVMethod.GLOBAL_MODE)
						_varFn.execute(_scnomvVarList[i], d);
				}
		} catch (Exception e) {
			throw new IOException(e);
		}
	}

	// ----------------------------------------------------------------------------------------------------------

	/** Serializes a CM_COV_Object as "w,mean._sum,mean._correction,m2._sum,m2._correction". */
	private String encodeCMObj(CM_COV_Object obj) {
		StringBuilder sb = new StringBuilder();
		sb.append(obj.w);
		sb.append(",");
		sb.append(obj.mean._sum);
		sb.append(",");
		sb.append(obj.mean._correction);
		sb.append(",");
		sb.append(obj.m2._sum);
		sb.append(",");
		sb.append(obj.m2._correction);
		return sb.toString();
	}

	/** Inverse of {@link #encodeCMObj(CM_COV_Object)}. */
	private CM_COV_Object decodeCMObj(String s) {
		CM_COV_Object obj = new CM_COV_Object();
		String[] parts = s.split(",");
		obj.w = UtilFunctions.parseToDouble(parts[0]);
		obj.mean._sum = UtilFunctions.parseToDouble(parts[1]);
		obj.mean._correction = UtilFunctions.parseToDouble(parts[2]);
		obj.m2._sum = UtilFunctions.parseToDouble(parts[3]);
		obj.m2._correction = UtilFunctions.parseToDouble(parts[4]);
		return obj;
	}

	/**
	 * Builds the mapper output token "mean_&lt;taskID&gt;_&lt;mean&gt;,&lt;suffix&gt;" where the
	 * suffix ("scmv"/"noscmv"/"scnomv") tells the reducer whether the column is
	 * scaled, imputed, or both. Returns null if no mean is tracked for this column.
	 */
	private DistinctValue prepMeanOutput(int taskID, int idx, StringBuilder sb, boolean scnomv) throws CharacterCodingException {
		MVMethod mthd = (scnomv ? _scnomvMethodList[idx] : _mvMethodList[idx]);

		if (scnomv || mthd == MVMethod.GLOBAL_MEAN || _isMVScaled.get(idx)) {
			String suffix = null;
			if (scnomv)
				suffix = "scnomv";
			else if (mthd == MVMethod.GLOBAL_MEAN && _isMVScaled.get(idx))
				suffix = "scmv"; // both scaled and mv imputed
			else if (mthd == MVMethod.GLOBAL_MEAN)
				suffix = "noscmv";
			else
				suffix = "scnomv";

			sb.setLength(0);
			sb.append(MEAN_PREFIX);
			sb.append("_");
			sb.append(taskID);
			sb.append("_");
			double mean = (scnomv ? _scnomvMeanList[idx]._sum : _meanList[idx]._sum);
			sb.append(Double.toString(mean));
			sb.append(",");
			sb.append(suffix);
			//String s = MEAN_PREFIX + "_" + taskID + "_" + Double.toString(_meanList[idx]._sum) + "," + suffix;
			return new DistinctValue(sb.toString(), -1L);
		}
		return null;
	}

	/** Builds the "correction_&lt;taskID&gt;_&lt;corr&gt;" token carrying the Kahan correction term. */
	private DistinctValue prepMeanCorrectionOutput(int taskID, int idx, StringBuilder sb, boolean scnomv) throws CharacterCodingException {
		MVMethod mthd = (scnomv ? _scnomvMethodList[idx] : _mvMethodList[idx]);
		if (scnomv || mthd == MVMethod.GLOBAL_MEAN || _isMVScaled.get(idx)) {
			sb.setLength(0);
			//CORRECTION_PREFIX + "_" + taskID + "_" + Double.toString(mean._correction);
			sb.append(CORRECTION_PREFIX);
			sb.append("_");
			sb.append(taskID);
			sb.append("_");
			double corr = (scnomv ? _scnomvMeanList[idx]._correction : _meanList[idx]._correction);
			sb.append(Double.toString(corr));
			return new DistinctValue(sb.toString(), -1L);
		}
		return null;
	}

	/** Builds the "validcount_&lt;taskID&gt;_&lt;count&gt;" token (#non-missing values seen by this task). */
	private DistinctValue prepMeanCountOutput(int taskID, int idx, StringBuilder sb, boolean scnomv) throws CharacterCodingException {
		MVMethod mthd = (scnomv ? _scnomvMethodList[idx] : _mvMethodList[idx]);
		if (scnomv || mthd == MVMethod.GLOBAL_MEAN || _isMVScaled.get(idx)) {
			sb.setLength(0);
			//s = COUNT_PREFIX + "_" + taskID + "_" + Long.toString(count);
			sb.append(COUNT_PREFIX);
			sb.append("_");
			sb.append(taskID);
			sb.append("_");
			long count = (scnomv ? _scnomvCountList[idx] : _countList[idx]);
			sb.append(Long.toString(count));
			return new DistinctValue(sb.toString(), -1L);
		}
		return null;
	}

	/**
	 * Builds the "totalcount_&lt;taskID&gt;_&lt;n&gt;" token; n is agents.getValid()
	 * (presumably the total #valid records processed by this task — confirm in TfUtils).
	 */
	private DistinctValue prepTotalCountOutput(int taskID, int idx, StringBuilder sb, boolean scnomv, TfUtils agents) throws CharacterCodingException {
		MVMethod mthd = (scnomv ? _scnomvMethodList[idx] : _mvMethodList[idx]);
		if (scnomv || mthd == MVMethod.GLOBAL_MEAN || _isMVScaled.get(idx)) {
			sb.setLength(0);
			//TOTAL_COUNT_PREFIX + "_" + taskID + "_" + Long.toString(TransformationAgent._numValidRecords);
			sb.append(TOTAL_COUNT_PREFIX);
			sb.append("_");
			sb.append(taskID);
			sb.append("_");
			sb.append(Long.toString(agents.getValid()));
			return new DistinctValue(sb.toString(), -1L);
		}
		return null;
	}

	/** Builds the "constant_&lt;replacement&gt;" token for columns imputed with a constant. */
	private DistinctValue prepConstantOutput(int idx, StringBuilder sb) throws CharacterCodingException {
		if (_mvMethodList == null)
			return null;
		MVMethod mthd = _mvMethodList[idx];
		if (mthd == MVMethod.CONSTANT) {
			sb.setLength(0);
			sb.append(CONSTANT_PREFIX);
			sb.append("_");
			sb.append(_replacementList[idx]);
			return new DistinctValue(sb.toString(), -1);
		}
		return null;
	}

	/** Builds the "var_&lt;taskID&gt;_&lt;encodedCM&gt;" token carrying this task's partial variance state. */
	private DistinctValue prepVarOutput(int taskID, int idx, StringBuilder sb, boolean scnomv) throws CharacterCodingException {
		if (scnomv || _isMVScaled.get(idx) && _mvscMethodList[idx] == MVMethod.GLOBAL_MODE) {
			sb.setLength(0);
			sb.append(VARIANCE_PREFIX);
			sb.append("_");
			sb.append(taskID);
			sb.append("_");
			CM_COV_Object cm = (scnomv ? _scnomvVarList[idx] : _varList[idx]);
			sb.append(encodeCMObj(cm));
			return new DistinctValue(sb.toString(), -1L);
		}
		return null;
	}

	/** Emits dv under key iw unless dv is null (prep* methods return null for N/A columns). */
	private void outDV(IntWritable iw, DistinctValue dv, OutputCollector<IntWritable, DistinctValue> out) throws IOException {
		if (dv != null)
			out.collect(iw, dv);
	}

	/**
	 * Method to output transformation metadata from the mappers.
	 * This information is collected and merged by the reducers.
	 * Keys are the negated column IDs; values are the prefixed partial-statistic tokens.
	 */
	@Override
	public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException {
		try {
			StringBuilder sb = new StringBuilder();
			DistinctValue dv = null;

			if (_colList != null)
				for (int i = 0; i < _colList.length; i++) {
					int colID = _colList[i];
					IntWritable iw = new IntWritable(-colID);

					dv = prepMeanOutput(taskID, i, sb, false);
					outDV(iw, dv, out);
					dv = prepMeanCorrectionOutput(taskID, i, sb, false);
					outDV(iw, dv, out);
					dv = prepMeanCountOutput(taskID, i, sb, false);
					outDV(iw, dv, out);
					dv = prepTotalCountOutput(taskID, i, sb, false, agents);
					outDV(iw, dv, out);
					dv = prepConstantOutput(i, sb);
					outDV(iw, dv, out);

					// output variance information relevant to scaling
					dv = prepVarOutput(taskID, i, sb, false);
					outDV(iw, dv, out);
				}

			// handle attributes that are scaled but not imputed
			if (_scnomvList != null)
				for (int i = 0; i < _scnomvList.length; i++) {
					int colID = _scnomvList[i];
					IntWritable iw = new IntWritable(-colID);

					dv = prepMeanOutput(taskID, i, sb, true);
					outDV(iw, dv, out);
					dv = prepMeanCorrectionOutput(taskID, i, sb, true);
					outDV(iw, dv, out);
					dv = prepMeanCountOutput(taskID, i, sb, true);
					outDV(iw, dv, out);
					dv = prepTotalCountOutput(taskID, i, sb, true, agents);
					outDV(iw, dv, out);
					dv = prepVarOutput(taskID, i, sb, true);
					outDV(iw, dv, out);
				}
		} catch (Exception e) {
			throw new IOException(e);
		}
	}

	/**
	 * Applicable when running on SPARK.
	 * Helper function to output transformation metadata into shuffle.
	 *
	 * @param iw integer value
	 * @param dv distinct value
	 * @param list list of integer-distinct value pairs
	 * @throws IOException if IOException occurs
	 */
	private void addDV(Integer iw, DistinctValue dv, ArrayList<Pair<Integer, DistinctValue>> list) throws IOException {
		if (dv != null)
			list.add(new Pair<Integer, DistinctValue>(iw, dv));
	}

	/**
	 * Spark counterpart of the MapReduce metadata output: collects the same
	 * prefixed partial-statistic tokens into a list instead of an OutputCollector.
	 *
	 * @param taskID task identifier embedded into the tokens
	 * @param list   output list (also returned for chaining)
	 * @param agents transform utilities providing per-task counters
	 * @return the populated list
	 * @throws IOException wrapping any processing error
	 */
	public ArrayList<Pair<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID, ArrayList<Pair<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
		try {
			StringBuilder sb = new StringBuilder();
			DistinctValue dv = null;

			if (_colList != null)
				for (int i = 0; i < _colList.length; i++) {
					int colID = _colList[i];
					Integer iw = -colID;

					dv = prepMeanOutput(taskID, i, sb, false);
					addDV(iw, dv, list);
					dv = prepMeanCorrectionOutput(taskID, i, sb, false);
					addDV(iw, dv, list);
					dv = prepMeanCountOutput(taskID, i, sb, false);
					addDV(iw, dv, list);
					dv = prepTotalCountOutput(taskID, i, sb, false, agents);
					addDV(iw, dv, list);
					dv = prepConstantOutput(i, sb);
					addDV(iw, dv, list);

					// output variance information relevant to scaling
					dv = prepVarOutput(taskID, i, sb, false);
					addDV(iw, dv, list);
				}

			// handle attributes that are scaled but not imputed
			if (_scnomvList != null)
				for (int i = 0; i < _scnomvList.length; i++) {
					int colID = _scnomvList[i];
					Integer iw = -colID;

					dv = prepMeanOutput(taskID, i, sb, true);
					addDV(iw, dv, list);
					dv = prepMeanCorrectionOutput(taskID, i, sb, true);
					addDV(iw, dv, list);
					dv = prepMeanCountOutput(taskID, i, sb, true);
					addDV(iw, dv, list);
					dv = prepTotalCountOutput(taskID, i, sb, true, agents);
					addDV(iw, dv, list);
					dv = prepVarOutput(taskID, i, sb, true);
					addDV(iw, dv, list);
				}
		} catch (Exception e) {
			throw new IOException(e);
		}

		return list;
	}

	// ----------------------------------------------------------------------------------------------------------

	/** Writes "colID<SEP>mean" to the Impute metadata file for this column (overwrites). */
	private void writeTfMtd(int colID, String mean, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException {
		Path pt = new Path(tfMtdDir + "/Impute/" + agents.getName(colID) + TfUtils.TXMTD_MV_FILE_SUFFIX);
		try (BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)))) {
			br.write(colID + TfUtils.TXMTD_SEP + mean + "\n");
		}
	}

	/** Writes "colID<SEP>mean<SEP>sdev" to the Scale metadata file for this column (overwrites). */
	private void writeTfMtd(int colID, String mean, String sdev, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException {
		Path pt = new Path(tfMtdDir + "/Scale/" + agents.getName(colID) + TfUtils.SCALE_FILE_SUFFIX);
		try (BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)))) {
			br.write(colID + TfUtils.TXMTD_SEP + mean + TfUtils.TXMTD_SEP + sdev + "\n");
		}
	}

	/** Writes "colID<SEP>min<SEP>max<SEP>binwidth<SEP>nbins" to the Bin metadata file (overwrites). */
	private void writeTfMtd(int colID, String min, String max, String binwidth, String nbins, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException {
		Path pt = new Path(tfMtdDir + "/Bin/" + agents.getName(colID) + TfUtils.TXMTD_BIN_FILE_SUFFIX);
		try (BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)))) {
			br.write(colID + TfUtils.TXMTD_SEP + min + TfUtils.TXMTD_SEP + max + TfUtils.TXMTD_SEP + binwidth + TfUtils.TXMTD_SEP + nbins + "\n");
		}
	}

	/**
	 * Single-node (non-merged) metadata output: writes per-column impute and
	 * scale metadata from the locally accumulated statistics. For constant-imputed,
	 * scaled columns the mean/variance are adjusted to account for the values
	 * that will be imputed (weight = #missing values).
	 *
	 * @param outputDir metadata output directory
	 * @param fs        filesystem handle
	 * @param agents    transform utilities (valid-record count, file naming)
	 * @throws IOException on write or computation errors
	 */
	public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
		try {
			if (_colList != null)
				for (int i = 0; i < _colList.length; i++) {
					int colID = _colList[i];

					double imputedValue = Double.NaN;
					KahanObject gmean = null;
					if (_mvMethodList[i] == MVMethod.GLOBAL_MEAN) {
						gmean = _meanList[i];
						imputedValue = _meanList[i]._sum;

						double mean = (_countList[i] == 0 ? 0.0 : _meanList[i]._sum);
						writeTfMtd(colID, Double.toString(mean), outputDir, fs, agents);
					} else if (_mvMethodList[i] == MVMethod.CONSTANT) {
						writeTfMtd(colID, _replacementList[i], outputDir, fs, agents);

						if (_isMVScaled.get(i)) {
							imputedValue = UtilFunctions.parseToDouble(_replacementList[i]);
							// adjust the global mean, by combining gmean with "replacement" (weight = #missing values)
							gmean = new KahanObject(_meanList[i]._sum, _meanList[i]._correction);
							_meanFn.execute(gmean, imputedValue, agents.getValid());
						}
					}

					if (_isMVScaled.get(i)) {
						double sdev = -1.0;
						if (_mvscMethodList[i] == MVMethod.GLOBAL_MODE) {
							// Adjust variance with missing values
							long totalMissingCount = (agents.getValid() - _countList[i]);
							_varFn.execute(_varList[i], imputedValue, totalMissingCount);
							double var = _varList[i].getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
							sdev = Math.sqrt(var);
						}
						writeTfMtd(colID, Double.toString(gmean._sum), Double.toString(sdev), outputDir, fs, agents);
					}
				}

			if (_scnomvList != null)
				for (int i = 0; i < _scnomvList.length; i++) {
					int colID = _scnomvList[i];
					double mean = (_scnomvCountList[i] == 0 ? 0.0 : _scnomvMeanList[i]._sum);
					double sdev = -1.0;
					if (_scnomvMethodList[i] == MVMethod.GLOBAL_MODE) {
						double var = _scnomvVarList[i].getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
						sdev = Math.sqrt(var);
					}
					writeTfMtd(colID, Double.toString(mean), Double.toString(sdev), outputDir, fs, agents);
				}
		} catch (DMLRuntimeException e) {
			throw new IOException(e);
		}
	}

	/**
	 * Method to merge map output transformation metadata.
	 * Reducer-side merge: parses the prefixed tokens emitted per mapper task,
	 * combines partial means (incremental pooled-mean update with Kahan
	 * compensation), combines partial variances via CM-object merging, adjusts
	 * for missing values, and writes the final Impute/Scale/Bin metadata files.
	 */
	@Override
	public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
		double min = Double.MAX_VALUE;
		double max = -Double.MAX_VALUE;
		int nbins = 0;
		double d;
		long totalRecordCount = 0, totalValidCount = 0;
		String mvConstReplacement = null;

		DistinctValue val = new DistinctValue();
		String w = null;

		// per-task partial mean state (mean, Kahan correction, valid count)
		class MeanObject {
			double mean, correction;
			long count;

			MeanObject() {
			}

			public String toString() {
				return mean + "," + correction + "," + count;
			}
		};

		HashMap<Integer, MeanObject> mapMeans = new HashMap<Integer, MeanObject>();
		HashMap<Integer, CM_COV_Object> mapVars = new HashMap<Integer, CM_COV_Object>();
		boolean isImputed = false;
		boolean isScaled = false;
		boolean isBinned = false;
		// dispatch each token on its prefix and aggregate per-task partials
		while (values.hasNext()) {
			val.reset();
			val = values.next();
			w = val.getWord();

			if (w.startsWith(MEAN_PREFIX)) {
				String[] parts = w.split("_");
				int taskID = UtilFunctions.parseToInt(parts[1]);
				MeanObject mo = mapMeans.get(taskID);
				if (mo == null)
					mo = new MeanObject();

				mo.mean = UtilFunctions.parseToDouble(parts[2].split(",")[0]);

				// check if this attribute is scaled
				String s = parts[2].split(",")[1];
				if (s.equalsIgnoreCase("scmv"))
					isScaled = isImputed = true;
				else if (s.equalsIgnoreCase("scnomv"))
					isScaled = true;
				else
					isImputed = true;
				mapMeans.put(taskID, mo);
			} else if (w.startsWith(CORRECTION_PREFIX)) {
				String[] parts = w.split("_");
				int taskID = UtilFunctions.parseToInt(parts[1]);
				MeanObject mo = mapMeans.get(taskID);
				if (mo == null)
					mo = new MeanObject();
				mo.correction = UtilFunctions.parseToDouble(parts[2]);
				mapMeans.put(taskID, mo);
			} else if (w.startsWith(CONSTANT_PREFIX)) {
				isImputed = true;
				String[] parts = w.split("_");
				mvConstReplacement = parts[1];
			} else if (w.startsWith(COUNT_PREFIX)) {
				String[] parts = w.split("_");
				int taskID = UtilFunctions.parseToInt(parts[1]);
				MeanObject mo = mapMeans.get(taskID);
				if (mo == null)
					mo = new MeanObject();
				mo.count = UtilFunctions.parseToLong(parts[2]);
				totalValidCount += mo.count;
				mapMeans.put(taskID, mo);
			} else if (w.startsWith(TOTAL_COUNT_PREFIX)) {
				String[] parts = w.split("_");
				//int taskID = UtilFunctions.parseToInt(parts[1]);
				totalRecordCount += UtilFunctions.parseToLong(parts[2]);
			} else if (w.startsWith(VARIANCE_PREFIX)) {
				isScaled = true;
				String[] parts = w.split("_");
				int taskID = UtilFunctions.parseToInt(parts[1]);
				CM_COV_Object cm = decodeCMObj(parts[2]);
				mapVars.put(taskID, cm);
			} else if (w.startsWith(BinAgent.MIN_PREFIX)) {
				isBinned = true;
				d = UtilFunctions.parseToDouble(w.substring(BinAgent.MIN_PREFIX.length()));
				if (d < min)
					min = d;
			} else if (w.startsWith(BinAgent.MAX_PREFIX)) {
				isBinned = true;
				d = UtilFunctions.parseToDouble(w.substring(BinAgent.MAX_PREFIX.length()));
				if (d > max)
					max = d;
			} else if (w.startsWith(BinAgent.NBINS_PREFIX)) {
				isBinned = true;
				nbins = (int) UtilFunctions.parseToLong(w.substring(BinAgent.NBINS_PREFIX.length()));
			} else
				throw new RuntimeException("MVImputeAgent: Invalid prefix while merging map output: " + w);
		}

		// compute global mean across all map outputs
		// (incremental pooled mean: gmean += (mo.mean - gmean) * mo.count / gcount)
		KahanObject gmean = new KahanObject(0, 0);
		KahanPlus kp = KahanPlus.getKahanPlusFnObject();
		long gcount = 0;
		for (MeanObject mo : mapMeans.values()) {
			gcount = gcount + mo.count;
			if (gcount > 0) {
				double delta = mo.mean - gmean._sum;
				kp.execute2(gmean, delta * mo.count / gcount);
				//_meanFn.execute2(gmean, mo.mean*mo.count, gcount);
			}
		}

		// compute global variance across all map outputs
		CM_COV_Object gcm = new CM_COV_Object();
		try {
			for (CM_COV_Object cm : mapVars.values())
				gcm = (CM_COV_Object) _varFn.execute(gcm, cm);
		} catch (DMLRuntimeException e) {
			throw new IOException(e);
		}

		// If the column is imputed with a constant, then adjust min and max based the value of the constant.
		if (isImputed && isBinned && mvConstReplacement != null) {
			double cst = UtilFunctions.parseToDouble(mvConstReplacement);
			if (cst < min)
				min = cst;
			if (cst > max)
				max = cst;
		}

		// write merged metadata
		if (isImputed) {
			String imputedValue = null;
			if (mvConstReplacement != null)
				imputedValue = mvConstReplacement;
			else
				imputedValue = Double.toString(gcount == 0 ? 0.0 : gmean._sum);
			writeTfMtd(colID, imputedValue, outputDir, fs, agents);
		}

		if (isBinned) {
			double binwidth = (max - min) / nbins;
			writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents);
		}

		if (isScaled) {
			try {
				if (totalValidCount != totalRecordCount) {
					// In the presence of missing values, the variance needs to be adjusted.
					// The mean does not need to be adjusted, when mv impute method is global_mean,
					// since missing values themselves are replaced with gmean.
					long totalMissingCount = (totalRecordCount - totalValidCount);
					int idx = isApplicable(colID);
					if (idx != -1 && _mvMethodList[idx] == MVMethod.CONSTANT)
						_meanFn.execute(gmean, UtilFunctions.parseToDouble(_replacementList[idx]), totalRecordCount);
					_varFn.execute(gcm, gmean._sum, totalMissingCount);
				}

				double mean = (gcount == 0 ? 0.0 : gmean._sum);
				double var = gcm.getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
				double sdev = (mapVars.size() > 0 ? Math.sqrt(var) : -1.0);

				writeTfMtd(colID, Double.toString(mean), Double.toString(sdev), outputDir, fs, agents);
			} catch (DMLRuntimeException e) {
				throw new IOException(e);
			}
		}
	}

	// ------------------------------------------------------------------------------------------------

	/** Reads the replacement value for colID from its Impute metadata file ("colID<SEP>value"). */
	private String readReplacement(int colID, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
		Path path = new Path(txMtdDir + "/Impute/" + agents.getName(colID) + TfUtils.TXMTD_MV_FILE_SUFFIX);
		TfUtils.checkValidInputFile(fs, path, true);

		String replacement = null;
		try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)))) {
			String line = br.readLine();
			replacement = UtilFunctions.unquote(line.split(TfUtils.TXMTD_SEP)[1]);
		}
		return replacement;
	}

	/** Reads the raw scale-metadata line ("colID,mean,sdev") for colID. */
	public String readScaleLine(int colID, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
		Path path = new Path(txMtdDir + "/Scale/" + agents.getName(colID) + TfUtils.SCALE_FILE_SUFFIX);
		TfUtils.checkValidInputFile(fs, path, true);
		String line = null;
		try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)))) {
			line = br.readLine();
		}
		return line;
	}

	/**
	 * Loads one column's scale metadata into the given accumulators.
	 * NOTE(review): the sdev is stored into varList[i].mean._sum — apply() later
	 * divides by exactly that field, so this repurposing appears intentional.
	 */
	private void processScalingFile(int i, int[] list, KahanObject[] meanList, CM_COV_Object[] varList, FileSystem fs, Path tfMtdDir, TfUtils agents) throws IOException {
		int colID = list[i];

		String line = readScaleLine(colID, fs, tfMtdDir, agents);
		String[] parts = line.split(",");
		double mean = UtilFunctions.parseToDouble(parts[1]);
		double sd = UtilFunctions.parseToDouble(parts[2]);

		meanList[i]._sum = mean;
		varList[i].mean._sum = sd;
	}

	// ------------------------------------------------------------------------------------------------

	/**
	 * Method to load transform metadata for all attributes:
	 * replacement values for imputed columns and mean/sdev for scaled columns.
	 */
	@Override
	public void loadTxMtd(JobConf job, FileSystem fs, Path tfMtdDir, TfUtils agents) throws IOException {
		if (fs.isDirectory(tfMtdDir)) {
			// Load information about missing value imputation
			if (_colList != null)
				for (int i = 0; i < _colList.length; i++) {
					int colID = _colList[i];

					if (_mvMethodList[i] == MVMethod.GLOBAL_MEAN || _mvMethodList[i] == MVMethod.GLOBAL_MODE)
						// global_mean or global_mode
						_replacementList[i] = readReplacement(colID, fs, tfMtdDir, agents);
					else if (_mvMethodList[i] == MVMethod.CONSTANT) {
						// constant: replace a missing value by a given constant
						// nothing to do. The constant values are loaded already during configure
					} else
						throw new RuntimeException("Invalid Missing Value Imputation methods: " + _mvMethodList[i]);
				}

			// Load scaling information
			if (_colList != null)
				for (int i = 0; i < _colList.length; i++)
					if (_isMVScaled.get(i))
						processScalingFile(i, _colList, _meanList, _varList, fs, tfMtdDir, agents);

			if (_scnomvList != null)
				for (int i = 0; i < _scnomvList.length; i++)
					processScalingFile(i, _scnomvList, _scnomvMeanList, _scnomvVarList, fs, tfMtdDir, agents);
		} else {
			throw new RuntimeException("Path to recode maps must be a directory: " + tfMtdDir);
		}
	}

	/** Returns the imputation method for colID, or INVALID if the column is not imputed. */
	public MVMethod getMethod(int colID) {
		int idx = isApplicable(colID);
		if (idx == -1)
			return MVMethod.INVALID;
		else
			return _mvMethodList[idx];
	}

	/** Returns the #non-missing values seen for colID, or 0 if the column is not imputed. */
	public long getNonMVCount(int colID) {
		int idx = isApplicable(colID);
		return (idx == -1) ? 0 : _countList[idx];
	}

	/** Returns the replacement value for colID, or null if the column is not imputed. */
	public String getReplacement(int colID) {
		int idx = isApplicable(colID);
		return (idx == -1) ? null : _replacementList[idx];
	}

	/** Builds the imputation statistics over the input frame, then applies them. */
	@Override
	public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
		build(in);
		return apply(in, out);
	}

	/**
	 * Accumulates per-column replacement values over the input frame:
	 * running Kahan mean for GLOBAL_MEAN columns, and a category histogram
	 * (kept in _hist across calls) whose argmax is the mode for GLOBAL_MODE columns.
	 */
	@Override
	public void build(FrameBlock in) {
		try {
			for (int j = 0; j < _colList.length; j++) {
				int colID = _colList[j];
				if (_mvMethodList[j] == MVMethod.GLOBAL_MEAN) {
					//compute global column mean (scale)
					long off = _countList[j];
					for (int i = 0; i < in.getNumRows(); i++)
						_meanFn.execute2(_meanList[j], UtilFunctions.objectToDouble(in.getSchema()[colID - 1], in.get(i, colID - 1)), off + i + 1);
					_replacementList[j] = String.valueOf(_meanList[j]._sum);
					_countList[j] += in.getNumRows();
				} else if (_mvMethodList[j] == MVMethod.GLOBAL_MODE) {
					//compute global column mode (categorical), i.e., most frequent category
					HashMap<String, Long> hist = _hist.containsKey(colID) ? _hist.get(colID) : new HashMap<String, Long>();
					for (int i = 0; i < in.getNumRows(); i++) {
						String key = String.valueOf(in.get(i, colID - 1));
						if (key != null && !key.isEmpty()) {
							Long val = hist.get(key);
							hist.put(key, (val != null) ? val + 1 : 1);
						}
					}
					_hist.put(colID, hist);
					long max = Long.MIN_VALUE;
					for (Entry<String, Long> e : hist.entrySet())
						if (e.getValue() > max) {
							_replacementList[j] = e.getKey();
							max = e.getValue();
						}
				}
			}
		} catch (Exception ex) {
			throw new RuntimeException(ex);
		}
	}

	/**
	 * Applies imputation (and scaling, when configured) to one row of string
	 * tokens in place: missing entries are replaced, then scaled columns are
	 * mean-subtracted or z-scored (sdev read from varList.mean._sum, see
	 * processScalingFile).
	 */
	@Override
	public String[] apply(String[] words) {
		if (isApplicable())
			for (int i = 0; i < _colList.length; i++) {
				int colID = _colList[i];
				String w = UtilFunctions.unquote(words[colID - 1]);
				if (TfUtils.isNA(_NAstrings, w))
					w = words[colID - 1] = _replacementList[i];

				if (_isMVScaled.get(i))
					if (_mvscMethodList[i] == MVMethod.GLOBAL_MEAN)
						words[colID - 1] = Double.toString(UtilFunctions.parseToDouble(w) - _meanList[i]._sum);
					else
						words[colID - 1] = Double.toString((UtilFunctions.parseToDouble(w) - _meanList[i]._sum) / _varList[i].mean._sum);
			}

		if (_scnomvList != null)
			for (int i = 0; i < _scnomvList.length; i++) {
				int colID = _scnomvList[i];
				if (_scnomvMethodList[i] == MVMethod.GLOBAL_MEAN)
					words[colID - 1] = Double.toString(UtilFunctions.parseToDouble(words[colID - 1]) - _scnomvMeanList[i]._sum);
				else
					words[colID - 1] = Double.toString((UtilFunctions.parseToDouble(words[colID - 1]) - _scnomvMeanList[i]._sum) / _scnomvVarList[i].mean._sum);
			}

		return words;
	}

	/** Replaces NaN cells of imputed columns in the output matrix with the column's replacement value. */
	@Override
	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
		for (int i = 0; i < in.getNumRows(); i++) {
			for (int j = 0; j < _colList.length; j++) {
				int colID = _colList[j];
				if (Double.isNaN(out.quickGetValue(i, colID - 1)))
					out.quickSetValue(i, colID - 1, Double.parseDouble(_replacementList[j]));
			}
		}
		return out;
	}

	/** Stores the per-column replacement values into the frame's column metadata. */
	@Override
	public FrameBlock getMetaData(FrameBlock out) {
		for (int j = 0; j < _colList.length; j++) {
			out.getColumnMetadata(_colList[j] - 1).setMvValue(_replacementList[j]);
		}
		return out;
	}

	/**
	 * Initializes the replacement list from frame metadata; for recoded columns
	 * the replacement is mapped through the recode map so imputation can be
	 * applied after recoding.
	 *
	 * @param meta frame holding column metadata and recode maps
	 */
	public void initMetaData(FrameBlock meta) {
		//init replacement lists, replace recoded values to
		//apply mv imputation potentially after recoding
		for (int j = 0; j < _colList.length; j++) {
			int colID = _colList[j];
			String mvVal = UtilFunctions.unquote(meta.getColumnMetadata(colID - 1).getMvValue());
			if (_rcList.contains(colID)) {
				Long mvVal2 = meta.getRecodeMap(colID - 1).get(mvVal);
				if (mvVal2 == null)
					throw new RuntimeException("Missing recode value for impute value '" + mvVal + "' (colID=" + colID + ").");
				_replacementList[j] = mvVal2.toString();
			} else {
				_replacementList[j] = mvVal;
			}
		}
	}

	/** Registers the list of recoded column IDs (consulted by initMetaData). */
	public void initRecodeIDList(List<Integer> rcList) {
		_rcList = rcList;
	}

	/**
	 * Exposes the internal histogram after build.
	 *
	 * @param colID column ID
	 * @return histogram (map of string keys and long values)
	 */
	public HashMap<String, Long> getHistogram(int colID) {
		return _hist.get(colID);
	}
}