com.ibm.bi.dml.runtime.transform.ApplyTfHelper.java Source code

Java tutorial

Introduction

Here is the source code for com.ibm.bi.dml.runtime.transform.ApplyTfHelper.java

Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.runtime.transform;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Pattern;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.wink.json4j.JSONObject;

import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;
import com.ibm.bi.dml.runtime.util.UtilFunctions;
import com.ibm.bi.dml.utils.JSONHelper;

/**
 * Helper that holds the configuration and the transformation agents needed to
 * apply a previously computed transformation to delimited text rows.
 *
 * <p>Typical call sequence, as implied by the field dependencies in this class:
 * construct from the {@link JobConf}, {@link #parseSpec()}, then
 * {@link #setupTfAgents(JSONObject)}, then {@link #loadTfMetadata(JSONObject)};
 * afterwards rows can be split with {@link #getWords(Text)}, filtered with
 * {@link #omit(String[])}, transformed with {@link #apply(String[])} and
 * serialized with {@link #checkAndPrepOutputString(String[], DummycodeAgent)}.
 *
 * <p>Fields are package-private and mutable; other classes in this package may
 * read them directly, so their names and visibility are kept unchanged.
 */
@SuppressWarnings("deprecation")
public class ApplyTfHelper {

    /** Error message prefix used when an unhandled empty string is found while na.strings are set. */
    private static final String EMPTY_STRING_MSG = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";

    boolean _hasHeader = false;
    String _delimString = null;
    Pattern _delim = null;
    String[] _naStrings = null;
    String _specFile = null;
    long _numCols = 0;
    JobConf _rJob = null;
    String _tmpPath = null;

    boolean _partFileWithHeader = false;

    OmitAgent _oa = null;
    MVImputeAgent _mia = null;
    RecodeAgent _ra = null;
    BinAgent _ba = null;
    DummycodeAgent _da = null;

    long _numTransformedRows;
    long _numTransformedColumns;

    /**
     * Reads the transform-related settings from the job configuration.
     *
     * <p>The delimiter is compiled via {@link Pattern#quote(String)}, i.e. it is
     * always matched as a literal string, never as a regular expression.
     *
     * @param job job configuration carrying the {@code MRJobConfiguration.TF_*} settings
     * @throws IllegalArgumentException declared for callers; not thrown explicitly here
     * @throws IOException declared for callers; not thrown explicitly here
     */
    public ApplyTfHelper(JobConf job) throws IllegalArgumentException, IOException {
        _hasHeader = Boolean.parseBoolean(job.get(MRJobConfiguration.TF_HAS_HEADER));

        _delimString = job.get(MRJobConfiguration.TF_DELIM);
        _delim = Pattern.compile(Pattern.quote(_delimString));

        _naStrings = DataTransform.parseNAStrings(job);

        _numCols = UtilFunctions.parseToLong(job.get(MRJobConfiguration.TF_NUM_COLS)); // #of columns in input data
        _tmpPath = job.get(MRJobConfiguration.TF_TMP_LOC);

        _specFile = job.get(MRJobConfiguration.TF_SPEC_FILE);

        _numTransformedRows = 0;
        _numTransformedColumns = 0;

        _rJob = job;
    }

    /**
     * Opens the transformation specification file on the job's file system and
     * parses it as JSON.
     *
     * @return the parsed specification
     * @throws IOException if the file cannot be opened, read, or closed
     */
    public JSONObject parseSpec() throws IOException {
        FileSystem fs = FileSystem.get(_rJob);
        // NOTE(review): the reader uses the platform default charset, as before.
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(_specFile))));
        try {
            return JSONHelper.parse(br);
        } finally {
            // close the stream even when parsing fails, so it is never leaked
            br.close();
        }
    }

    /**
     * Initializes the shared {@code TransformationAgent} state and creates one
     * agent of each kind from the given specification.
     *
     * @param spec parsed transformation specification
     */
    public void setupTfAgents(JSONObject spec) {
        // Set up transformation agents
        TransformationAgent.init(_naStrings, _rJob.get(MRJobConfiguration.TF_HEADER), _delimString);
        _oa = new OmitAgent(spec);
        _mia = new MVImputeAgent(spec);
        _ra = new RecodeAgent(spec);
        _ba = new BinAgent(spec);
        _da = new DummycodeAgent(spec, _numCols);
    }

    /**
     * Loads the transformation metadata for all agents from the first file
     * registered in the distributed cache, and determines whether the input
     * split handled by this task belongs to the file recorded under
     * {@code MRJobConfiguration.TF_SMALLEST_FILE}.
     *
     * <p>Requires {@link #setupTfAgents(JSONObject)} to have been called, since
     * the agents are dereferenced here.
     *
     * @param spec parsed transformation specification (not used by this method)
     * @throws IOException if no distributed cache file is available, or if
     *         reading metadata fails
     */
    public void loadTfMetadata(JSONObject spec) throws IOException {
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(_rJob);
        if (cacheFiles == null || cacheFiles.length == 0)
            throw new IOException("No distributed cache files found; cannot locate transformation metadata.");
        Path txMtdDir = cacheFiles[0];
        FileSystem localFS = FileSystem.getLocal(_rJob);

        // load transformation metadata
        _mia.loadTxMtd(_rJob, localFS, txMtdDir);
        _ra.loadTxMtd(_rJob, localFS, txMtdDir);
        _ba.loadTxMtd(_rJob, localFS, txMtdDir);

        // associate recode maps and bin definitions with dummycoding agent,
        // as recoded and binned columns are typically dummycoded
        _da.setRecodeMaps(_ra.getRecodeMaps());
        _da.setNumBins(_ba.getBinList(), _ba.getNumBins());
        _da.loadTxMtd(_rJob, localFS, txMtdDir);

        // compare fully qualified paths so that both sides use the same scheme/authority
        FileSystem fs = FileSystem.get(_rJob);
        String thisFile = new Path(_rJob.get("map.input.file")).makeQualified(fs).toString();
        String smallestFile = new Path(_rJob.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs).toString();
        _partFileWithHeader = thisFile.equals(smallestFile);
    }

    /**
     * Builds the dummycoded header, generates the dummycode maps and column
     * types, and writes both the original and the dummycoded header to the
     * temporary path.
     *
     * <p>The header text is taken from the job configuration
     * ({@code MRJobConfiguration.TF_HEADER}); {@code rawValue} is not consulted.
     *
     * @param rawValue the raw header line (not used by this method)
     * @return number of columns in the dummycoded header
     * @throws IOException if writing the generated files fails
     */
    public long processHeaderLine(Text rawValue) throws IOException {
        String header = _rJob.get(MRJobConfiguration.TF_HEADER);

        String dcdHeader = _da.constructDummycodedHeader(header, _delimString);

        FileSystem fs = FileSystem.get(_rJob);
        _da.genDcdMapsAndColTypes(fs, _tmpPath, (int) _numCols, _ra, _ba);

        // write header information (before and after transformation) to temporary path
        // these files are copied into txMtdPath, once the ApplyTf job is complete.
        DataTransform.generateHeaderFiles(fs, _tmpPath, header, dcdHeader);

        // limit -1 keeps trailing empty fields, so they are counted as columns
        _numTransformedColumns = _delim.split(dcdHeader, -1).length;
        return _numTransformedColumns;
    }

    /**
     * Splits a line on the configured (literal) delimiter, keeping trailing
     * empty fields.
     *
     * @param line input row
     * @return the fields of the row
     */
    public String[] getWords(Text line) {
        return _delim.split(line.toString(), -1);
    }

    /**
     * Asks the omit agent whether the given row should be dropped.
     *
     * @param words fields of one row
     * @return the omit agent's decision
     */
    public boolean omit(String[] words) {
        return _oa.omit(words);
    }

    /**
     * Applies missing-value imputation, recoding, binning and dummycoding, in
     * that order, and counts the row as transformed.
     *
     * @param words fields of one row
     * @return the transformed fields
     */
    public String[] apply(String[] words) {
        words = _mia.apply(words);
        words = _ra.apply(words);
        words = _ba.apply(words);
        words = _da.apply(words);
        _numTransformedRows++;
        return words;
    }

    /** @return number of rows passed through {@link #apply(String[])} so far */
    public long getNumTransformedRows() {
        return _numTransformedRows;
    }

    /** @return number of columns computed by the last {@link #processHeaderLine(Text)} call, or 0 */
    public long getNumTransformedColumns() {
        return _numTransformedColumns;
    }

    /**
     * Verifies that no field is an empty string when NA strings are configured
     * ({@code TransformationAgent.NAstrings != null}). Does nothing otherwise.
     *
     * @param words transformed fields of one row; {@code null} entries are ignored
     * @param da dummycode agent used to map the offending 1-based column ID for the message
     * @throws DMLRuntimeException if an empty string is found while NA strings are configured
     */
    public static void check(String[] words, DummycodeAgent da) throws DMLRuntimeException {
        if (TransformationAgent.NAstrings == null)
            return;

        for (int i = 0; i < words.length; i++)
            if (words[i] != null && words[i].isEmpty())
                throw new DMLRuntimeException(EMPTY_STRING_MSG + da.mapDcdColumnID(i + 1));
    }

    /**
     * Convenience overload of
     * {@link #checkAndPrepOutputString(String[], StringBuilder, String, DummycodeAgent)}
     * that uses a fresh buffer and this helper's delimiter.
     *
     * @param words transformed fields of one row
     * @param da dummycode agent used to map column IDs in error messages
     * @return the delimited output row
     * @throws DMLRuntimeException if an unhandled empty string is found while NA strings are configured
     */
    public String checkAndPrepOutputString(String[] words, DummycodeAgent da) throws DMLRuntimeException {
        return checkAndPrepOutputString(words, new StringBuilder(), _delimString, da);
    }

    /**
     * Joins the fields with {@code delim} into one output row, writing
     * {@code "0"} for every {@code null} field.
     *
     * <p>Unless na.strings are provided, empty strings are (implicitly) considered
     * as value zero and are emitted unchanged. When na.strings are provided, ""
     * is considered a missing value indicator and the user is expected to provide
     * an appropriate imputation method; therefore "" encountered in any column
     * (after all transformations are applied) denotes an erroneous condition.
     *
     * <p>An empty {@code words} array yields an empty string.
     *
     * @param words transformed fields of one row
     * @param sb buffer to build the row in; it is cleared first and may hold a
     *        partial row if an exception is thrown
     * @param delim delimiter placed between fields
     * @param da dummycode agent used to map the offending 1-based column ID for the message
     * @return the delimited output row
     * @throws DMLRuntimeException if an empty string is found while NA strings are configured
     */
    public static String checkAndPrepOutputString(String[] words, StringBuilder sb, String delim, DummycodeAgent da)
            throws DMLRuntimeException {
        boolean checkEmptyString = (TransformationAgent.NAstrings != null);

        sb.setLength(0);
        for (int i = 0; i < words.length; i++) {
            if (i > 0)
                sb.append(delim);

            String word = words[i];
            if (word == null)
                sb.append("0");
            else if (checkEmptyString && word.isEmpty())
                throw new DMLRuntimeException(EMPTY_STRING_MSG + da.mapDcdColumnID(i + 1));
            else
                sb.append(word);
        }

        return sb.toString();
    }

}