Java tutorial: the DataTransform class (com.ibm.bi.dml.runtime.transform)
/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.ibm.bi.dml.runtime.transform;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

import scala.Tuple2;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.conf.DMLConfig;
import com.ibm.bi.dml.lops.CSVReBlock;
import com.ibm.bi.dml.lops.Lop;
import com.ibm.bi.dml.lops.LopProperties.ExecType;
import com.ibm.bi.dml.parser.Expression.DataType;
import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.parser.ParameterizedBuiltinFunctionExpression;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject;
import com.ibm.bi.dml.runtime.controlprogram.context.ExecutionContext;
import com.ibm.bi.dml.runtime.controlprogram.context.SparkExecutionContext;
import com.ibm.bi.dml.runtime.instructions.Instruction;
import com.ibm.bi.dml.runtime.instructions.InstructionParser;
import com.ibm.bi.dml.runtime.instructions.MRJobInstruction;
import com.ibm.bi.dml.runtime.instructions.mr.CSVReblockInstruction;
import com.ibm.bi.dml.runtime.instructions.spark.ParameterizedBuiltinSPInstruction;
import com.ibm.bi.dml.runtime.instructions.spark.data.RDDObject;
import com.ibm.bi.dml.runtime.instructions.spark.utils.RDDConverterUtils;
import com.ibm.bi.dml.runtime.matrix.CSVReblockMR;
import com.ibm.bi.dml.runtime.matrix.CSVReblockMR.AssignRowIDMRReturn;
import com.ibm.bi.dml.runtime.matrix.JobReturn;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.data.CSVFileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;
import com.ibm.bi.dml.runtime.transform.TransformationAgent.TX_METHOD;
import com.ibm.bi.dml.runtime.util.MapReduceTool;
import com.ibm.bi.dml.runtime.util.UtilFunctions;
import com.ibm.bi.dml.utils.JSONHelper;

public class DataTransform {

	/**
	 * Method to read the header line from the input data file.
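	 * <p>
	 * Illustrative note (added, not part of the original javadoc): when the CSV
	 * file has no header row, a synthetic header is constructed from the
	 * delimiter and the number of fields in the first data line; for example, a
	 * four-column comma-separated file without a header yields "V1,V2,V3,V4".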
	 * 
	 * @param fs
	 * @param prop
	 * @param smallestFile
	 * @return
	 * @throws IOException
	 */
	private static String readHeaderLine(FileSystem fs, CSVFileFormatProperties prop, String smallestFile)
			throws IOException {
		String line = null;

		BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(smallestFile))));
		line = br.readLine();
		br.close();

		if (prop.hasHeader()) {
			; // nothing here
		}
		else {
			// construct header with default column names, V1, V2, etc.
			int ncol = Pattern.compile(Pattern.quote(prop.getDelim())).split(line, -1).length;
			line = null;

			StringBuilder sb = new StringBuilder();
			sb.append("V1");
			for (int i = 2; i <= ncol; i++)
				sb.append(prop.getDelim() + "V" + i);
			line = sb.toString();
		}

		return line;
	}

	/**
	 * Method to construct a mapping between column names and their
	 * corresponding column IDs. The mapping is used to prepare the
	 * specification file in <code>processSpecFile()</code>.
	 * 
	 * @param fs
	 * @param prop
	 * @param headerLine
	 * @param smallestFile
	 * @return
	 * @throws IllegalArgumentException
	 * @throws IOException
	 */
	private static HashMap<String, Integer> processColumnNames(FileSystem fs, CSVFileFormatProperties prop,
			String headerLine, String smallestFile) throws IllegalArgumentException, IOException {
		HashMap<String, Integer> colNames = new HashMap<String, Integer>();

		String escapedDelim = Pattern.quote(prop.getDelim());
		Pattern compiledDelim = Pattern.compile(escapedDelim);
		String[] names = compiledDelim.split(headerLine, -1);

		for (int i = 0; i < names.length; i++)
			colNames.put(UtilFunctions.unquote(names[i].trim()), i + 1);

		return colNames;
	}

	/**
	 * In-place permutation of list, mthd, and cst arrays based on indices,
	 * by navigating through cycles in the permutation.
	 * 
	 * @param list
	 * @param mthd
	 * @param cst
	 * @param indices
	 */
	private static void inplacePermute(int[] list, byte[] mthd, Object[] cst, Integer[] indices) {
		int x;
		byte xb = 0;
		Object xo = null;

		int j, k;
		for (int i = 0; i < list.length; i++) {
			x = list[i];
			xb = mthd[i];
			if (cst != null)
				xo = cst[i];

			j = i;
			while (true) {
				k = indices[j];
				indices[j] = j;

				if (k == i)
					break;

				list[j] = list[k];
				mthd[j] = mthd[k];
				if (cst != null)
					cst[j] = cst[k];
				j = k;
			}

			list[j] = x;
			mthd[j] = xb;
			if (cst != null)
				cst[j] = xo;
		}
	}

	/**
	 * Convert input transformation specification file with column names into a
	 * specification with corresponding column Ids. This file is sent to all the
	 * relevant MR jobs.
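	 * <p>
	 * Illustrative sketch (added commentary): given a header "id,age,gender,zip",
	 * a names-based entry such as ["gender", "zip"] under one of the
	 * transformation keys is rewritten to the column IDs [3, 4] using the map
	 * built by <code>processColumnNames()</code>; the rewritten specification is
	 * written to a temporary file whose path is returned to the caller.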
* * @param fs * @param inputPath * @param smallestFile * @param colNames * @param prop * @param specFileWithNames * @return * @throws IllegalArgumentException * @throws IOException * @throws JSONException */ private static String processSpecFile(FileSystem fs, String inputPath, String smallestFile, HashMap<String, Integer> colNames, CSVFileFormatProperties prop, String specFileWithNames) throws IllegalArgumentException, IOException, JSONException { // load input spec file with Names BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(specFileWithNames)))); JSONObject inputSpec = JSONHelper.parse(br); br.close(); final String NAME = "name"; final String ID = "id"; final String METHOD = "method"; final String VALUE = "value"; final String MV_METHOD_MEAN = "global_mean"; final String MV_METHOD_MODE = "global_mode"; final String MV_METHOD_CONSTANT = "constant"; final String BIN_METHOD_WIDTH = "equi-width"; final String BIN_METHOD_HEIGHT = "equi-height"; final String SCALE_METHOD_Z = "z-score"; final String SCALE_METHOD_M = "mean-subtraction"; final String JSON_BYPOS = "ids"; String stmp = null; JSONObject entry = null; byte btmp = 0; final int[] mvList; int[] rcdList, dcdList, omitList; final int[] binList; final int[] scaleList; byte[] mvMethods = null, binMethods = null, scaleMethods = null; Object[] numBins = null; Object[] mvConstants = null; boolean byPositions = (inputSpec.containsKey(JSON_BYPOS) && ((Boolean) inputSpec.get(JSON_BYPOS)).booleanValue() == true); // -------------------------------------------------------------------------- // Omit if (inputSpec.containsKey(TX_METHOD.OMIT.toString())) { JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.OMIT.toString()); omitList = new int[arrtmp.size()]; for (int i = 0; i < arrtmp.size(); i++) { if (byPositions) omitList[i] = UtilFunctions.toInt(arrtmp.get(i)); else { stmp = UtilFunctions.unquote((String) arrtmp.get(i)); omitList[i] = colNames.get(stmp); } } Arrays.sort(omitList); } else omitList = null; // -------------------------------------------------------------------------- // Missing value imputation if (inputSpec.containsKey(TX_METHOD.IMPUTE.toString())) { JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.IMPUTE.toString()); mvList = new int[arrtmp.size()]; mvMethods = new byte[arrtmp.size()]; mvConstants = new Object[arrtmp.size()]; for (int i = 0; i < arrtmp.size(); i++) { entry = (JSONObject) arrtmp.get(i); if (byPositions) { mvList[i] = UtilFunctions.toInt(entry.get(ID)); } else { stmp = UtilFunctions.unquote((String) entry.get(NAME)); mvList[i] = colNames.get(stmp); } stmp = UtilFunctions.unquote((String) entry.get(METHOD)); if (stmp.equals(MV_METHOD_MEAN)) btmp = (byte) 1; else if (stmp.equals(MV_METHOD_MODE)) btmp = (byte) 2; else if (stmp.equals(MV_METHOD_CONSTANT)) btmp = (byte) 3; else throw new IOException("Unknown missing value imputation method (" + stmp + ") in transformation specification file: " + specFileWithNames); mvMethods[i] = btmp; //txMethods.add( btmp ); mvConstants[i] = null; if (entry.containsKey(VALUE)) mvConstants[i] = entry.get(VALUE); } Integer[] idx = new Integer[mvList.length]; for (int i = 0; i < mvList.length; i++) idx[i] = i; Arrays.sort(idx, new Comparator<Integer>() { @Override public int compare(Integer o1, Integer o2) { return (mvList[o1] - mvList[o2]); } }); // rearrange mvList, mvMethods, and mvConstants according to permutation idx inplacePermute(mvList, mvMethods, mvConstants, idx); } else mvList = null; // 
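		/*
		 * Illustrative example (added commentary; the top-level key is whatever
		 * TX_METHOD.IMPUTE.toString() returns, shown here as "impute"). A
		 * names-based missing-value section of the spec could look like:
		 *
		 *   { "impute": [ { "name": "age",    "method": "global_mean" },
		 *                 { "name": "salary", "method": "constant", "value": 0 } ] }
		 *
		 * Each method is encoded as a byte (1 = global_mean, 2 = global_mode,
		 * 3 = constant), and the entries are then sorted by column ID through
		 * inplacePermute(), exactly as done above for mvList/mvMethods/mvConstants.
		 */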
		// --------------------------------------------------------------------------
		// Recoding
		if (inputSpec.containsKey(TX_METHOD.RECODE.toString())) {
			JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.RECODE.toString());
			rcdList = new int[arrtmp.size()];
			for (int i = 0; i < arrtmp.size(); i++) {
				if (byPositions)
					rcdList[i] = UtilFunctions.toInt(arrtmp.get(i));
				else {
					stmp = UtilFunctions.unquote((String) arrtmp.get(i));
					rcdList[i] = colNames.get(stmp);
				}
			}
			Arrays.sort(rcdList);
		} else
			rcdList = null;

		// --------------------------------------------------------------------------
		// Binning
		if (inputSpec.containsKey(TX_METHOD.BIN.toString())) {
			JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.BIN.toString());
			binList = new int[arrtmp.size()];
			binMethods = new byte[arrtmp.size()];
			numBins = new Object[arrtmp.size()];
			for (int i = 0; i < arrtmp.size(); i++) {
				entry = (JSONObject) arrtmp.get(i);

				if (byPositions) {
					binList[i] = UtilFunctions.toInt(entry.get(ID));
				} else {
					stmp = UtilFunctions.unquote((String) entry.get(NAME));
					binList[i] = colNames.get(stmp);
				}

				stmp = UtilFunctions.unquote((String) entry.get(METHOD));
				if (stmp.equals(BIN_METHOD_WIDTH))
					btmp = (byte) 1;
				else if (stmp.equals(BIN_METHOD_HEIGHT))
					throw new IOException("Equi-height binning method is not yet supported in transformation specification file: "
							+ specFileWithNames);
				else
					throw new IOException("Unknown binning method (" + stmp
							+ ") in transformation specification file: " + specFileWithNames);
				binMethods[i] = btmp;

				numBins[i] = entry.get(TransformationAgent.JSON_NBINS);
				if (((Integer) numBins[i]).intValue() <= 1)
					throw new IllegalArgumentException("Invalid transformation on column \"" + (String) entry.get(NAME)
							+ "\". Number of bins must be greater than 1.");
			}

			Integer[] idx = new Integer[binList.length];
			for (int i = 0; i < binList.length; i++)
				idx[i] = i;
			Arrays.sort(idx, new Comparator<Integer>() {
				@Override
				public int compare(Integer o1, Integer o2) {
					return (binList[o1] - binList[o2]);
				}
			});

			// rearrange binList, binMethods, and numBins according to permutation idx
			inplacePermute(binList, binMethods, numBins, idx);
		} else
			binList = null;

		// --------------------------------------------------------------------------
		// Dummycoding
		if (inputSpec.containsKey(TX_METHOD.DUMMYCODE.toString())) {
			JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.DUMMYCODE.toString());
			dcdList = new int[arrtmp.size()];
			for (int i = 0; i < arrtmp.size(); i++) {
				if (byPositions)
					dcdList[i] = UtilFunctions.toInt(arrtmp.get(i));
				else {
					stmp = UtilFunctions.unquote((String) arrtmp.get(i));
					dcdList[i] = colNames.get(stmp);
				}
			}
			Arrays.sort(dcdList);
		} else
			dcdList = null;

		// --------------------------------------------------------------------------
		// Scaling
		if (inputSpec.containsKey(TX_METHOD.SCALE.toString())) {
			JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.SCALE.toString());
			scaleList = new int[arrtmp.size()];
			scaleMethods = new byte[arrtmp.size()];
			for (int i = 0; i < arrtmp.size(); i++) {
				entry = (JSONObject) arrtmp.get(i);

				if (byPositions) {
					scaleList[i] = UtilFunctions.toInt(entry.get(ID));
				} else {
					stmp = UtilFunctions.unquote((String) entry.get(NAME));
					scaleList[i] = colNames.get(stmp);
				}

				stmp = UtilFunctions.unquote((String) entry.get(METHOD));
				if (stmp.equals(SCALE_METHOD_M))
					btmp = (byte) 1;
				else if (stmp.equals(SCALE_METHOD_Z))
					btmp = (byte) 2;
				else
					throw new IOException("Unknown scaling method (" + stmp
							+ ") in transformation specification file: " + specFileWithNames);
				scaleMethods[i] = btmp;
} Integer[] idx = new Integer[scaleList.length]; for (int i = 0; i < scaleList.length; i++) idx[i] = i; Arrays.sort(idx, new Comparator<Integer>() { @Override public int compare(Integer o1, Integer o2) { return (scaleList[o1] - scaleList[o2]); } }); // rearrange scaleList and scaleMethods according to permutation idx inplacePermute(scaleList, scaleMethods, null, idx); } else scaleList = null; // -------------------------------------------------------------------------- // check for column IDs that are imputed with mode, but not recoded // These columns have be handled separately, because the computation of mode // requires the computation of distinct values (i.e., recode maps) ArrayList<Integer> tmpList = new ArrayList<Integer>(); if (mvList != null) for (int i = 0; i < mvList.length; i++) { int colID = mvList[i]; if (mvMethods[i] == 2 && (rcdList == null || Arrays.binarySearch(rcdList, colID) < 0)) tmpList.add(colID); } int[] mvrcdList = null; if (tmpList.size() > 0) { mvrcdList = new int[tmpList.size()]; for (int i = 0; i < tmpList.size(); i++) mvrcdList[i] = tmpList.get(i); } // Perform Validity Checks /* OMIT MVI RCD BIN DCD SCL OMIT - x * * * * MVI x - * * * * RCD * * - x * x BIN * * x - * x DCD * * * * - x SCL * * x x x - */ if (mvList != null) for (int i = 0; i < mvList.length; i++) { int colID = mvList[i]; if (omitList != null && Arrays.binarySearch(omitList, colID) >= 0) throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be both omitted and imputed."); if (mvMethods[i] == 1) { if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0) throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A numeric column can not be recoded."); if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0) // throw an error only if the column is not binned if (binList == null || Arrays.binarySearch(binList, colID) < 0) throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A numeric column can not be dummycoded."); } } if (scaleList != null) for (int i = 0; i < scaleList.length; i++) { int colID = scaleList[i]; if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0) throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be recoded and scaled."); if (binList != null && Arrays.binarySearch(binList, colID) >= 0) throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be binned and scaled."); if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0) throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be dummycoded and scaled."); } if (rcdList != null) for (int i = 0; i < rcdList.length; i++) { int colID = rcdList[i]; if (binList != null && Arrays.binarySearch(binList, colID) >= 0) throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be recoded and binned."); } // Check if dummycoded columns are either recoded or binned. // If not, add them to recode list. ArrayList<Integer> addToRcd = new ArrayList<Integer>(); if (dcdList != null) for (int i = 0; i < dcdList.length; i++) { int colID = dcdList[i]; boolean isRecoded = (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0); boolean isBinned = (binList != null && Arrays.binarySearch(binList, colID) >= 0); // If colID is neither recoded nor binned, then, add it to rcdList. 
if (!isRecoded && !isBinned) addToRcd.add(colID); } if (addToRcd.size() > 0) { int[] newRcdList = null; if (rcdList != null) newRcdList = Arrays.copyOf(rcdList, rcdList.length + addToRcd.size()); else newRcdList = new int[addToRcd.size()]; int i = (rcdList != null ? rcdList.length : 0); for (int idx = 0; i < newRcdList.length; i++, idx++) newRcdList[i] = addToRcd.get(idx); Arrays.sort(newRcdList); rcdList = newRcdList; } // ----------------------------------------------------------------------------- // Prepare output spec JSONObject outputSpec = new JSONObject(); if (omitList != null) { JSONObject rcdSpec = new JSONObject(); rcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(omitList)); outputSpec.put(TX_METHOD.OMIT.toString(), rcdSpec); } if (mvList != null) { JSONObject mvSpec = new JSONObject(); mvSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(mvList)); mvSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(mvMethods)); mvSpec.put(TransformationAgent.JSON_CONSTS, toJSONArray(mvConstants)); outputSpec.put(TX_METHOD.IMPUTE.toString(), mvSpec); } if (rcdList != null) { JSONObject rcdSpec = new JSONObject(); rcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(rcdList)); outputSpec.put(TX_METHOD.RECODE.toString(), rcdSpec); } if (binList != null) { JSONObject binSpec = new JSONObject(); binSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(binList)); binSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(binMethods)); binSpec.put(TransformationAgent.JSON_NBINS, toJSONArray(numBins)); outputSpec.put(TX_METHOD.BIN.toString(), binSpec); } if (dcdList != null) { JSONObject dcdSpec = new JSONObject(); dcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(dcdList)); outputSpec.put(TX_METHOD.DUMMYCODE.toString(), dcdSpec); } if (scaleList != null) { JSONObject scaleSpec = new JSONObject(); scaleSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(scaleList)); scaleSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(scaleMethods)); outputSpec.put(TX_METHOD.SCALE.toString(), scaleSpec); } if (mvrcdList != null) { JSONObject mvrcd = new JSONObject(); mvrcd.put(TransformationAgent.JSON_ATTRS, toJSONArray(mvrcdList)); outputSpec.put(TX_METHOD.MVRCD.toString(), mvrcd); } // write out the spec with IDs String specFileWithIDs = MRJobConfiguration.constructTempOutputFilename(); BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(specFileWithIDs), true))); out.write(outputSpec.toString()); out.close(); return specFileWithIDs; } private static JSONArray toJSONArray(int[] list) { JSONArray ret = new JSONArray(list.length); for (int i = 0; i < list.length; i++) ret.add(list[i]); return ret; } private static JSONArray toJSONArray(byte[] list) { JSONArray ret = new JSONArray(list.length); for (int i = 0; i < list.length; i++) ret.add(list[i]); return ret; } private static JSONArray toJSONArray(Object[] list) { JSONArray ret = new JSONArray(list.length); for (int i = 0; i < list.length; i++) ret.add(list[i]); return ret; } private static final String ERROR_MSG_ZERO_ROWS = "Number of rows in the transformed output (potentially, after ommitting the ones with missing values) is zero. Cannot proceed."; /** * Private class to hold the relevant input parameters to transform operation. 
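	 * <p>
	 * Added note (not in the original javadoc): the MR constructor splits the
	 * instruction string on Instruction.OPERAND_DELIM and reads the transform
	 * metadata path from field 3 and either the spec file or the apply-metadata
	 * path from field 4, depending on the boolean flag in field 5; with eight
	 * fields, field 6 names the output-names file. The Spark constructor reads
	 * the same information from the instruction's parameter map instead.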
*/ private static class TransformOperands { String inputPath = null, txMtdPath = null, applyTxPath = null, specFile = null, outNamesFile = null; boolean isApply = false; CSVFileFormatProperties inputCSVProperties = null; TransformOperands(String inst, MatrixObject inputMatrix) { String[] instParts = inst.split(Instruction.OPERAND_DELIM); inputPath = inputMatrix.getFileName(); txMtdPath = instParts[3]; isApply = Boolean.parseBoolean(instParts[5]); if (isApply) { applyTxPath = instParts[4]; } else { specFile = instParts[4]; } if (instParts.length == 8) outNamesFile = instParts[6]; inputCSVProperties = (CSVFileFormatProperties) inputMatrix.getFileFormatProperties(); } TransformOperands(ParameterizedBuiltinSPInstruction inst, MatrixObject inputMatrix) { HashMap<String, String> params = inst.getParams(); inputPath = inputMatrix.getFileName(); txMtdPath = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_TXMTD); specFile = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_TXSPEC); applyTxPath = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_APPLYMTD); isApply = (applyTxPath != null); outNamesFile = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_OUTNAMES); // can be null inputCSVProperties = (CSVFileFormatProperties) inputMatrix.getFileFormatProperties(); } } /** * Helper function to move transformation metadata files from a temporary * location to permanent location. These files (e.g., header before and * after transformation) are generated by a single mapper, while applying * data transformations. Note that, these files must be ultimately be placed * under the existing metadata directory (txMtdPath), which is * simultaneously read by other mappers. If they are not created at a * temporary location, then MR tasks fail due to changing timestamps on * txMtdPath. * * @param fs * @param tmpPath * @param txMtdPath * @throws IllegalArgumentException * @throws IOException */ private static void moveFilesFromTmp(FileSystem fs, String tmpPath, String txMtdPath) throws IllegalArgumentException, IOException { // move files from temporary location to txMtdPath MapReduceTool.renameFileOnHDFS(tmpPath + "/" + TransformationAgent.OUT_HEADER, txMtdPath + "/" + TransformationAgent.OUT_HEADER); MapReduceTool.renameFileOnHDFS(tmpPath + "/" + TransformationAgent.OUT_DCD_HEADER, txMtdPath + "/" + TransformationAgent.OUT_DCD_HEADER); MapReduceTool.renameFileOnHDFS(tmpPath + "/" + TransformationAgent.COLTYPES_FILE_NAME, txMtdPath + "/" + TransformationAgent.COLTYPES_FILE_NAME); if (fs.exists(new Path(tmpPath + "/Dummycode/" + TransformationAgent.DCD_FILE_NAME))) { if (!fs.exists(new Path(txMtdPath + "/Dummycode/"))) fs.mkdirs(new Path(txMtdPath + "/Dummycode/")); MapReduceTool.renameFileOnHDFS(tmpPath + "/Dummycode/" + TransformationAgent.DCD_FILE_NAME, txMtdPath + "/Dummycode/" + TransformationAgent.DCD_FILE_NAME); } } /** * Helper function to determine the number of columns after applying * transformations. Note that dummycoding changes the number of columns. 
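	 * <p>
	 * Worked example (added for illustration): each dummycoded column adds its
	 * number of bins or distinct values minus one to the original column count.
	 * With 10 input columns, one dummycoded column binned into 5 bins and one
	 * recoded column with 7 distinct values give 10 + (5 - 1) + (7 - 1) = 20
	 * columns in the transformed data.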
	 * 
	 * @param fs
	 * @param header
	 * @param delim
	 * @param tfMtdPath
	 * @return
	 * @throws IllegalArgumentException
	 * @throws IOException
	 * @throws DMLRuntimeException
	 * @throws JSONException
	 */
	private static int getNumColumnsTf(FileSystem fs, String header, String delim, String tfMtdPath)
			throws IllegalArgumentException, IOException, DMLRuntimeException, JSONException {
		String[] columnNames = Pattern.compile(Pattern.quote(delim)).split(header, -1);
		int ret = columnNames.length;

		BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(tfMtdPath + "/spec.json"))));
		JSONObject spec = JSONHelper.parse(br);
		br.close();

		// fetch relevant attribute lists
		if (!spec.containsKey(TX_METHOD.DUMMYCODE.toString()))
			return ret;

		JSONArray dcdList = (JSONArray) ((JSONObject) spec.get(TX_METHOD.DUMMYCODE.toString()))
				.get(TransformationAgent.JSON_ATTRS);

		// look for numBins among binned columns
		for (Object o : dcdList) {
			int id = UtilFunctions.toInt(o);

			Path binpath = new Path(tfMtdPath + "/Bin/" + UtilFunctions.unquote(columnNames[id - 1])
					+ TransformationAgent.BIN_FILE_SUFFIX);
			Path rcdpath = new Path(tfMtdPath + "/Recode/" + UtilFunctions.unquote(columnNames[id - 1])
					+ TransformationAgent.NDISTINCT_FILE_SUFFIX);

			if (TfUtils.checkValidInputFile(fs, binpath, false)) {
				br = new BufferedReader(new InputStreamReader(fs.open(binpath)));
				int nbins = UtilFunctions.parseToInt(br.readLine().split(TransformationAgent.TXMTD_SEP)[4]);
				br.close();
				ret += (nbins - 1);
			}
			else if (TfUtils.checkValidInputFile(fs, rcdpath, false)) {
				br = new BufferedReader(new InputStreamReader(fs.open(rcdpath)));
				int ndistinct = UtilFunctions.parseToInt(br.readLine());
				br.close();
				ret += (ndistinct - 1);
			}
			else
				throw new DMLRuntimeException("Relevant transformation metadata for column (id=" + id + ", name="
						+ columnNames[id - 1] + ") is not found.");
		}
		//System.out.println("Number of columns in transformed data: " + ret);

		return ret;
	}

	/**
	 * Main method to create and/or apply transformation metadata using MapReduce.
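	 * <p>
	 * Added overview (not in the original javadoc): the method reads the header
	 * of the smallest part file, rewrites the specification with column IDs via
	 * processSpecFile() (unless existing metadata is being applied), builds the
	 * transformation metadata with GenTfMtdMR, and then runs ApplyTfCSVMR and/or
	 * ApplyTfBBMR depending on whether the outputs are CSV files or binary-block
	 * matrices, finally writing the .mtd metadata files for the produced outputs.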
* * @param jobinst * @param inputMatrices * @param shuffleInst * @param otherInst * @param resultIndices * @param outputMatrices * @param numReducers * @param replication * @return * @throws Exception */ public static JobReturn mrDataTransform(MRJobInstruction jobinst, MatrixObject[] inputMatrices, String shuffleInst, String otherInst, byte[] resultIndices, MatrixObject[] outputMatrices, int numReducers, int replication) throws Exception { String[] insts = shuffleInst.split(Instruction.INSTRUCTION_DELIM); // Parse transform instruction (the first instruction) to obtain relevant fields TransformOperands oprnds = new TransformOperands(insts[0], inputMatrices[0]); JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); FileSystem fs = FileSystem.get(job); // find the first file in alphabetical ordering of partfiles in directory inputPath String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath); // find column names String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile); HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile); String outHeader = getOutputHeader(fs, headerLine, oprnds); int numColumns = colNamesToIds.size(); int numColumnsTf = 0; long numRowsTf = 0; ArrayList<Integer> csvoutputs = new ArrayList<Integer>(); ArrayList<Integer> bboutputs = new ArrayList<Integer>(); // divide output objects based on output format (CSV or BinaryBlock) for (int i = 0; i < outputMatrices.length; i++) { if (outputMatrices[i].getFileFormatProperties() != null && outputMatrices[i].getFileFormatProperties() .getFileFormat() == FileFormatProperties.FileFormat.CSV) csvoutputs.add(i); else bboutputs.add(i); } boolean isCSV = (csvoutputs.size() > 0); boolean isBB = (bboutputs.size() > 0); String tmpPath = MRJobConfiguration.constructTempOutputFilename(); JobReturn retCSV = null, retBB = null; if (!oprnds.isApply) { // build specification file with column IDs insteadof column names String specFileWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.specFile); colNamesToIds = null; // enable GC on colNamesToIds // Build transformation metadata, including recode maps, bin definitions, etc. 
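			// Added note (illustrative, based on the rest of this class): this step
			// writes per-column metadata (e.g., recode maps under Recode/ and bin
			// definitions under Bin/) into the metadata directory txMtdPath; spec.json
			// is copied in right below, and the header/column-type files are staged in
			// a temporary location and moved in later by moveFilesFromTmp().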
// Also, generate part offsets file (counters file), which is to be used in csv-reblock String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename(); numRowsTf = GenTfMtdMR.runJob(oprnds.inputPath, oprnds.txMtdPath, specFileWithIDs, smallestFile, partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader); if (numRowsTf == 0) throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS); // store the specFileWithIDs as transformation metadata MapReduceTool.copyFileOnHDFS(specFileWithIDs, oprnds.txMtdPath + "/" + "spec.json"); numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath); // Apply transformation metadata, and perform actual transformation if (isCSV) retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specFileWithIDs, oprnds.txMtdPath, tmpPath, outputMatrices[csvoutputs.get(0)].getFileName(), partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader); if (isBB) { DMLConfig conf = ConfigurationManager.getConfig(); int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE); CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize); AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specFileWithIDs); if (ret1.rlens[0] == 0) throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS); retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specFileWithIDs, oprnds.txMtdPath, tmpPath, outputMatrices[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numRowsTf, numColumns, numColumnsTf, replication, outHeader); } MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job); } else { colNamesToIds = null; // enable GC on colNamesToIds // copy given transform metadata (applyTxPath) to specified location (txMtdPath) MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job); MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath); // path to specification file String specFileWithIDs = oprnds.txMtdPath + "/" + "spec.json"; numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath); if (isCSV) { DMLConfig conf = ConfigurationManager.getConfig(); int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE); CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize); AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specFileWithIDs); numRowsTf = ret1.rlens[0]; if (ret1.rlens[0] == 0) throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS); // Apply transformation metadata, and perform actual transformation retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specFileWithIDs, oprnds.applyTxPath, tmpPath, outputMatrices[csvoutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numColumns, replication, outHeader); } if (isBB) { // compute part offsets file CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser .parseSingleInstruction(insts[1]); CSVReblockInstruction newrblk = (CSVReblockInstruction) 
rblk.clone((byte) 0); AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { newrblk.brlen }, new int[] { newrblk.bclen }, newrblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specFileWithIDs); numRowsTf = ret1.rlens[0]; if (ret1.rlens[0] == 0) throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS); // apply transformation metadata, as well as reblock the resulting data retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specFileWithIDs, oprnds.txMtdPath, tmpPath, outputMatrices[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, ret1.rlens[0], ret1.clens[0], numColumnsTf, replication, outHeader); } } // copy auxiliary data (old and new header lines) from temporary location to txMtdPath moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath); // generate matrix metadata file for outputs if (retCSV != null) { retCSV.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf); CSVFileFormatProperties prop = new CSVFileFormatProperties(false, oprnds.inputCSVProperties.getDelim(), // use the same header as the input false, Double.NaN, null); MapReduceTool.writeMetaDataFile(outputMatrices[csvoutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retCSV.getMatrixCharacteristics(0), OutputInfo.CSVOutputInfo, prop); return retCSV; } if (retBB != null) { retBB.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf); MapReduceTool.writeMetaDataFile(outputMatrices[bboutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retBB.getMatrixCharacteristics(0), OutputInfo.BinaryBlockOutputInfo); return retBB; } return null; } private static CSVReblockInstruction prepDummyReblockInstruction(CSVFileFormatProperties prop, int blockSize) { StringBuilder sb = new StringBuilder(); sb.append(ExecType.MR); sb.append(Lop.OPERAND_DELIMITOR); sb.append(CSVReBlock.OPCODE); sb.append(Lop.OPERAND_DELIMITOR); sb.append("0"); sb.append(Lop.DATATYPE_PREFIX); sb.append(DataType.MATRIX); sb.append(Lop.VALUETYPE_PREFIX); sb.append(ValueType.DOUBLE); sb.append(Lop.OPERAND_DELIMITOR); sb.append("1"); sb.append(Lop.DATATYPE_PREFIX); sb.append(DataType.MATRIX); sb.append(Lop.VALUETYPE_PREFIX); sb.append(ValueType.DOUBLE); sb.append(Lop.OPERAND_DELIMITOR); sb.append(blockSize); sb.append(Lop.OPERAND_DELIMITOR); sb.append(blockSize); sb.append(Lop.OPERAND_DELIMITOR); sb.append(prop.hasHeader()); sb.append(Lop.OPERAND_DELIMITOR); sb.append(prop.getDelim()); sb.append(Lop.OPERAND_DELIMITOR); sb.append(prop.isFill()); sb.append(Lop.OPERAND_DELIMITOR); sb.append(prop.getFillValue()); return (CSVReblockInstruction) CSVReblockInstruction.parseInstruction(sb.toString()); } private static String getOutputHeader(FileSystem fs, String headerLine, TransformOperands oprnds) throws IOException { String ret = null; if (oprnds.isApply) { BufferedReader br = new BufferedReader(new InputStreamReader( fs.open(new Path(oprnds.applyTxPath + "/" + TransformationAgent.OUT_HEADER)))); ret = br.readLine(); br.close(); } else { if (oprnds.outNamesFile == null) ret = headerLine; else { BufferedReader br = new BufferedReader( new InputStreamReader(fs.open(new Path(oprnds.outNamesFile)))); ret = br.readLine(); br.close(); } } return ret; } /** * Main method to create and/or apply transformation metdata in-memory, on a * single node. 
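	 * <p>
	 * Added note (not in the original javadoc): unlike the MapReduce variant,
	 * this path reads the part files directly with buffered readers, builds the
	 * transformation metadata in memory through the TfUtils agents
	 * (MVImputeAgent, RecodeAgent, BinAgent, DummycodeAgent), and writes either
	 * a CSV file or an in-memory MatrixBlock for the single output; see
	 * performTransform() below.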
* * @param inst * @param inputMatrices * @param outputMatrices * @return * @throws IOException * @throws DMLRuntimeException * @throws JSONException * @throws IllegalArgumentException */ public static JobReturn cpDataTransform(String inst, MatrixObject[] inputMatrices, MatrixObject[] outputMatrices) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException { String[] insts = inst.split(Instruction.INSTRUCTION_DELIM); // Parse transform instruction (the first instruction) to obtain relevant fields TransformOperands oprnds = new TransformOperands(insts[0], inputMatrices[0]); JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); FileSystem fs = FileSystem.get(job); // find the first file in alphabetical ordering of partfiles in directory inputPath String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath); // find column names String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile); HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile); String outHeader = getOutputHeader(fs, headerLine, oprnds); ArrayList<Integer> csvoutputs = new ArrayList<Integer>(); ArrayList<Integer> bboutputs = new ArrayList<Integer>(); // divide output objects based on output format (CSV or BinaryBlock) for (int i = 0; i < outputMatrices.length; i++) { if (outputMatrices[i].getFileFormatProperties() != null && outputMatrices[i].getFileFormatProperties() .getFileFormat() == FileFormatProperties.FileFormat.CSV) csvoutputs.add(i); else bboutputs.add(i); } boolean isCSV = (csvoutputs.size() > 0); boolean isBB = (bboutputs.size() > 0); JobReturn ret = null; if (!oprnds.isApply) { // build specification file with column IDs insteadof column names String specFileWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.specFile); MapReduceTool.copyFileOnHDFS(specFileWithIDs, oprnds.txMtdPath + "/" + "spec.json"); ret = performTransform(job, fs, oprnds.inputPath, colNamesToIds.size(), oprnds.inputCSVProperties, specFileWithIDs, oprnds.txMtdPath, oprnds.isApply, outputMatrices[0], outHeader, isBB, isCSV); } else { // copy given transform metadata (applyTxPath) to specified location (txMtdPath) MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job); MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath); // path to specification file String specFileWithIDs = oprnds.txMtdPath + "/" + "spec.json"; ret = performTransform(job, fs, oprnds.inputPath, colNamesToIds.size(), oprnds.inputCSVProperties, specFileWithIDs, oprnds.txMtdPath, oprnds.isApply, outputMatrices[0], outHeader, isBB, isCSV); } return ret; } /** * Helper function to fetch and sort the list of part files under the given * input directory. 
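	 * <p>
	 * Added note: if the input path is a directory, its non-hidden part files
	 * are listed and sorted by path (for example part-00000, part-00001, ...,
	 * a naming used here only for illustration); a single file is returned as a
	 * one-element list.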
* * @param input * @param fs * @return * @throws FileNotFoundException * @throws IOException */ @SuppressWarnings("unchecked") private static ArrayList<Path> collectInputFiles(String input, FileSystem fs) throws FileNotFoundException, IOException { Path path = new Path(input); ArrayList<Path> files = new ArrayList<Path>(); if (fs.isDirectory(path)) { for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter)) files.add(stat.getPath()); Collections.sort(files); } else files.add(path); return files; } private static int[] countNumRows(ArrayList<Path> files, CSVFileFormatProperties prop, FileSystem fs, TfUtils agents) throws IOException { int[] rows = new int[2]; int numRows = 0, numRowsTf = 0; OmitAgent oa = agents.getOmitAgent(); if (!oa.isApplicable()) { for (int fileNo = 0; fileNo < files.size(); fileNo++) { BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); if (fileNo == 0 && prop.hasHeader()) br.readLine(); //ignore header while (br.readLine() != null) numRows++; br.close(); } numRowsTf = numRows; } else { String line = null; String[] words; Pattern delim = Pattern.compile(Pattern.quote(prop.getDelim())); for (int fileNo = 0; fileNo < files.size(); fileNo++) { BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); if (fileNo == 0 && prop.hasHeader()) br.readLine(); //ignore header while ((line = br.readLine()) != null) { numRows++; words = delim.split(line, -1); if (!oa.omit(words, agents)) numRowsTf++; } br.close(); } } rows[0] = numRows; rows[1] = numRowsTf; return rows; } /** * Main method to create and/or apply transformation metdata in-memory, on a single node. * * @param job * @param fs * @param inputPath * @param ncols * @param prop * @param specFileWithIDs * @param tfMtdPath * @param applyTxPath * @param isApply * @param outputPath * @param headerLine * @throws IOException * @throws DMLRuntimeException * @throws JSONException * @throws IllegalArgumentException */ private static JobReturn performTransform(JobConf job, FileSystem fs, String inputPath, int ncols, CSVFileFormatProperties prop, String specFileWithIDs, String tfMtdPath, boolean isApply, MatrixObject result, String headerLine, boolean isBB, boolean isCSV) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException { String[] na = TfUtils.parseNAStrings(prop.getNAStrings()); JSONObject spec = TfUtils.readSpec(fs, specFileWithIDs); TfUtils agents = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), na, spec, ncols, tfMtdPath, null, null); MVImputeAgent _mia = agents.getMVImputeAgent(); RecodeAgent _ra = agents.getRecodeAgent(); BinAgent _ba = agents.getBinAgent(); DummycodeAgent _da = agents.getDummycodeAgent(); // List of files to read ArrayList<Path> files = collectInputFiles(inputPath, fs); // --------------------------------- // Construct transformation metadata // --------------------------------- String line = null; String[] words = null; int numColumnsTf = 0; BufferedReader br = null; if (!isApply) { for (int fileNo = 0; fileNo < files.size(); fileNo++) { br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); if (fileNo == 0 && prop.hasHeader()) br.readLine(); //ignore header line = null; while ((line = br.readLine()) != null) { agents.prepareTfMtd(line); } br.close(); } if (agents.getValid() == 0) throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS); _mia.outputTransformationMetadata(tfMtdPath, fs, agents); _ba.outputTransformationMetadata(tfMtdPath, fs, agents); 
_ra.outputTransformationMetadata(tfMtdPath, fs, agents); // prepare agents for the subsequent phase of applying transformation metadata // NO need to loadTxMtd for _ra, since the maps are already present in the memory Path tmp = new Path(tfMtdPath); _mia.loadTxMtd(job, fs, tmp, agents); _ba.loadTxMtd(job, fs, tmp, agents); _da.setRecodeMapsCP(_ra.getCPRecodeMaps()); _da.setNumBins(_ba.getBinList(), _ba.getNumBins()); _da.loadTxMtd(job, fs, tmp, agents); } else { // Count the number of rows int rows[] = countNumRows(files, prop, fs, agents); agents.setTotal(rows[0]); agents.setValid(rows[1]); if (agents.getValid() == 0) throw new DMLRuntimeException( "Number of rows in the transformed output (potentially, after ommitting the ones with missing values) is zero. Cannot proceed."); // Load transformation metadata // prepare agents for the subsequent phase of applying transformation metadata Path tmp = new Path(tfMtdPath); _mia.loadTxMtd(job, fs, tmp, agents); _ra.loadTxMtd(job, fs, tmp, agents); _ba.loadTxMtd(job, fs, tmp, agents); _da.setRecodeMaps(_ra.getRecodeMaps()); _da.setNumBins(_ba.getBinList(), _ba.getNumBins()); _da.loadTxMtd(job, fs, tmp, agents); } // ----------------------------- // Apply transformation metadata // ----------------------------- numColumnsTf = getNumColumnsTf(fs, headerLine, prop.getDelim(), tfMtdPath); MapReduceTool.deleteFileIfExistOnHDFS(result.getFileName()); BufferedWriter out = new BufferedWriter( new OutputStreamWriter(fs.create(new Path(result.getFileName()), true))); StringBuilder sb = new StringBuilder(); MatrixBlock mb = null; if (isBB) { int estNNZ = (int) agents.getValid() * ncols; mb = new MatrixBlock((int) agents.getValid(), numColumnsTf, estNNZ); if (mb.isInSparseFormat()) mb.allocateSparseRowsBlock(); else mb.allocateDenseBlock(); } int rowID = 0; // rowid to be used in filling the matrix block for (int fileNo = 0; fileNo < files.size(); fileNo++) { br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); if (fileNo == 0) { String header = null; if (prop.hasHeader()) br.readLine(); // ignore the header line from data file header = headerLine; String dcdHeader = _da.constructDummycodedHeader(header, agents.getDelim()); numColumnsTf = _da.genDcdMapsAndColTypes(fs, tfMtdPath, ncols, agents); DataTransform.generateHeaderFiles(fs, tfMtdPath, header, dcdHeader); } line = null; while ((line = br.readLine()) != null) { words = agents.getWords(line); if (!agents.omit(words)) { words = agents.apply(words, !isApply); if (isCSV) { out.write(agents.checkAndPrepOutputString(words, sb)); out.write("\n"); } if (isBB) { agents.check(words); for (int c = 0; c < words.length; c++) { if (words[c] == null || words[c].isEmpty()) ; else mb.appendValue(rowID, c, UtilFunctions.parseToDouble(words[c])); } } rowID++; } } br.close(); } out.close(); if (mb != null) { mb.recomputeNonZeros(); mb.examSparsity(); result.acquireModify(mb); result.release(); result.exportData(); } MatrixCharacteristics mc = new MatrixCharacteristics(agents.getValid(), numColumnsTf, (int) result.getNumRowsPerBlock(), (int) result.getNumColumnsPerBlock()); JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, true); return ret; } public static void generateHeaderFiles(FileSystem fs, String txMtdDir, String origHeader, String newHeader) throws IOException { // write out given header line Path pt = new Path(txMtdDir + "/" + TransformationAgent.OUT_HEADER); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true))); br.write(origHeader + "\n"); 
br.close(); // write out the new header line (after all transformations) pt = new Path(txMtdDir + "/" + TransformationAgent.OUT_DCD_HEADER); br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true))); br.write(newHeader + "\n"); br.close(); } public static void spDataTransform(ParameterizedBuiltinSPInstruction inst, MatrixObject[] inputMatrices, MatrixObject[] outputMatrices, ExecutionContext ec) throws Exception { SparkExecutionContext sec = (SparkExecutionContext) ec; // Parse transform instruction (the first instruction) to obtain relevant fields TransformOperands oprnds = new TransformOperands(inst, inputMatrices[0]); JobConf job = new JobConf(); FileSystem fs = FileSystem.get(job); // find the first file in alphabetical ordering of partfiles in directory inputPath String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath); // find column names and construct output header String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile); HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile); int numColumns = colNamesToIds.size(); String outHeader = getOutputHeader(fs, headerLine, oprnds); String tmpPath = MRJobConfiguration.constructTempOutputFilename(); // Construct RDD for input data @SuppressWarnings("unchecked") JavaPairRDD<LongWritable, Text> inputData = (JavaPairRDD<LongWritable, Text>) sec .getRDDHandleForMatrixObject(inputMatrices[0], InputInfo.CSVInputInfo); JavaRDD<Tuple2<LongWritable, Text>> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD(); long numRowsTf = 0, numColumnsTf = 0; JavaPairRDD<Long, String> tfPairRDD = null; if (!oprnds.isApply) { // build specification file with column IDs insteadof column names String specFileWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.specFile); colNamesToIds = null; // enable GC on colNamesToIds // Build transformation metadata, including recode maps, bin definitions, etc. 
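			// Added note (illustrative): in the Spark path the input CSV lines are held
			// in the csvLines RDD, the metadata pass runs in GenTfMtdSPARK.runSparkJob(),
			// and the apply pass in ApplyTfCSVSPARK.runSparkJob() returns a
			// JavaPairRDD<Long, String> of transformed CSV lines, which is later
			// converted to LongWritable/Text pairs and registered as the RDD handle of
			// the output MatrixObject.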
// Also, generate part offsets file (counters file), which is to be used in csv-reblock (if needed) String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename(); numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specFileWithIDs, partOffsetsFile, oprnds.inputCSVProperties, numColumns, outHeader); // store the specFileWithIDs as transformation metadata MapReduceTool.copyFileOnHDFS(specFileWithIDs, oprnds.txMtdPath + "/" + "spec.json"); numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath); tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specFileWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader); MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job); } else { colNamesToIds = null; // enable GC on colNamesToIds // copy given transform metadata (applyTxPath) to specified location (txMtdPath) MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job); MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath); // path to specification file String specFileWithIDs = oprnds.txMtdPath + "/" + "spec.json"; numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath); // Apply transformation metadata, and perform actual transformation tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specFileWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader); } // copy auxiliary data (old and new header lines) from temporary location to txMtdPath moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath); // convert to csv output format (serialized longwritable/text) JavaPairRDD<LongWritable, Text> outtfPairRDD = RDDConverterUtils.stringToSerializableText(tfPairRDD); if (outtfPairRDD != null) { MatrixObject outMO = outputMatrices[0]; String outVar = outMO.getVarName(); outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar)); sec.addLineageRDD(outVar, inst.getParams().get("target")); //update output statistics (required for correctness) MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar); mcOut.setDimension(numRowsTf, numColumnsTf); mcOut.setNonZeros(-1); } } }
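/*
 * End-to-end illustration (added commentary). The key strings below are only
 * examples: the top-level keys are whatever TX_METHOD.*.toString() returns
 * (shown lower-case here), and the bin-count key is TransformationAgent.JSON_NBINS
 * (shown as "numbins"). A names-based specification consumed by processSpecFile()
 * could then look like:
 *
 *   {
 *     "omit":      [ "id" ],
 *     "impute":    [ { "name": "age",    "method": "global_mean" },
 *                    { "name": "salary", "method": "constant", "value": 0 } ],
 *     "recode":    [ "gender" ],
 *     "bin":       [ { "name": "age", "method": "equi-width", "numbins": 5 } ],
 *     "dummycode": [ "gender", "age" ],
 *     "scale":     [ { "name": "salary", "method": "z-score" } ]
 *   }
 *
 * processSpecFile() rewrites the names to column IDs, adds dummycoded columns
 * that are neither recoded nor binned to the recode list, and stores the
 * resulting spec.json next to the generated transformation metadata.
 */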