Java tutorial: TfUtils (com.ibm.bi.dml.runtime.transform)
/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.transform;

import java.io.BufferedReader;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.parser.DataExpression;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import com.ibm.bi.dml.runtime.io.MatrixReader;
import com.ibm.bi.dml.runtime.matrix.CSVReblockMR;
import com.ibm.bi.dml.runtime.matrix.CSVReblockMR.OffsetCount;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;
import com.ibm.bi.dml.runtime.util.MapReduceTool;
import com.ibm.bi.dml.runtime.util.UtilFunctions;
import com.ibm.bi.dml.utils.JSONHelper;

@SuppressWarnings("deprecation")
public class TfUtils implements Serializable {

    private static final long serialVersionUID = 526252850872633125L;

    private OmitAgent _oa = null;
    private MVImputeAgent _mia = null;
    private RecodeAgent _ra = null;
    private BinAgent _ba = null;
    private DummycodeAgent _da = null;

    private long _numRecordsInPartFile;   // Total number of records in the data file
    private long _numValidRecords;        // (_numRecordsInPartFile - #of omitted records)
    private long _numTransformedRows;     // Number of rows after applying transformations
    private long _numTransformedColumns;  // Number of columns after applying transformations

    private String _headerLine = null;
    private boolean _hasHeader;
    private Pattern _delim = null;
    private String _delimString = null;
    private String[] _NAstrings = null;
    private String[] _outputColumnNames = null;
    private long _numInputCols = -1;

    private String _tfMtdDir = null;
    private String _specFile = null;
    private String _offsetFile = null;
    private String _tmpDir = null;
    private String _outputPath = null;

    protected static boolean checkValidInputFile(FileSystem fs, Path path, boolean err) throws IOException {
        // check non-existing file
        if (!fs.exists(path))
            if (err)
                throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");
            else
                return false;

        // check for empty file
        if (MapReduceTool.isFileEmpty(fs, path.toString()))
            if (err)
                throw new EOFException("Empty input file " + path.toString() + ".");
            else
                return false;

        return true;
    }

    public static String getPartFileName(JobConf job) throws IOException {
        FileSystem fs = FileSystem.get(job);
        Path thisPath = new Path(job.get("map.input.file")).makeQualified(fs);
        return thisPath.toString();
    }

    public static boolean isPartFileWithHeader(JobConf job) throws IOException {
        FileSystem fs = FileSystem.get(job);
        String thisfile = getPartFileName(job);
        Path smallestFilePath = new Path(job.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs);
        if (thisfile.toString().equals(smallestFilePath.toString()))
            return true;
        else
            return false;
    }

    public static JSONObject readSpec(FileSystem fs, String specFile) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(specFile))));
        JSONObject obj = JSONHelper.parse(br);
        br.close();
        return obj;
    }

    /**
     * Prepare NA strings so that they can be sent to workers via JobConf.
     * A "dummy" string is added at the end to handle the case of empty strings.
     * @param na
     * @return
     */
    public static String prepNAStrings(String na) {
        return na + DataExpression.DELIM_NA_STRING_SEP + "dummy";
    }

    public static String[] parseNAStrings(String na) {
        if (na == null)
            return null;

        String[] tmp = Pattern.compile(Pattern.quote(DataExpression.DELIM_NA_STRING_SEP)).split(na, -1);
        return tmp; //Arrays.copyOf(tmp, tmp.length-1);
    }

    public static String[] parseNAStrings(JobConf job) {
        return parseNAStrings(job.get(MRJobConfiguration.TF_NA_STRINGS));
    }

    private void createAgents(JSONObject spec) throws IOException, JSONException {
        _oa = new OmitAgent(spec);
        _mia = new MVImputeAgent(spec);
        _ra = new RecodeAgent(spec);
        _ba = new BinAgent(spec);
        _da = new DummycodeAgent(spec, _numInputCols);
    }

    public void setupAgents(OmitAgent oa, MVImputeAgent mia, RecodeAgent ra, BinAgent ba, DummycodeAgent da) {
        _oa = oa;
        _mia = mia;
        _ra = ra;
        _ba = ba;
        _da = da;
    }

    private void parseColumnNames() {
        _outputColumnNames = _delim.split(_headerLine, -1);
        for (int i = 0; i < _outputColumnNames.length; i++)
            _outputColumnNames[i] = UtilFunctions.unquote(_outputColumnNames[i]);
    }

    private void init(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec,
            long numCols, String offsetFile, String tmpPath, String outputPath) throws IOException, JSONException {
        _numRecordsInPartFile = 0;
        _numValidRecords = 0;
        _numTransformedRows = 0;
        _numTransformedColumns = 0;

        _headerLine = headerLine;
        _hasHeader = hasHeader;
        _delimString = delim;
        _delim = Pattern.compile(Pattern.quote(delim));
        _NAstrings = naStrings;
        _numInputCols = numCols;
        _offsetFile = offsetFile;
        _tmpDir = tmpPath;
        _outputPath = outputPath;

        parseColumnNames();
        createAgents(spec);
    }

    public TfUtils(JobConf job, boolean minimal) throws IOException, JSONException {
        if (!InfrastructureAnalyzer.isLocalMode(job)) {
            ConfigurationManager.setCachedJobConf(job);
        }

        _NAstrings = TfUtils.parseNAStrings(job);
        _specFile = job.get(MRJobConfiguration.TF_SPEC_FILE);

        FileSystem fs = FileSystem.get(job);
        JSONObject spec = TfUtils.readSpec(fs, _specFile);

        _oa = new OmitAgent(spec);
    }

    // called from GenTFMtdMapper, ApplyTf (Hadoop)
    public TfUtils(JobConf job) throws IOException, JSONException {
        if (!InfrastructureAnalyzer.isLocalMode(job)) {
            ConfigurationManager.setCachedJobConf(job);
        }

        boolean hasHeader = Boolean.parseBoolean(job.get(MRJobConfiguration.TF_HAS_HEADER));
        //Pattern delim = Pattern.compile(Pattern.quote(job.get(MRJobConfiguration.TF_DELIM)));
        String[] naStrings = TfUtils.parseNAStrings(job);
        long numCols = UtilFunctions.parseToLong(job.get(MRJobConfiguration.TF_NUM_COLS)); // #of columns in input data
        String specFile = job.get(MRJobConfiguration.TF_SPEC_FILE);
        String offsetFile = job.get(MRJobConfiguration.TF_OFFSETS_FILE);
        String tmpPath = job.get(MRJobConfiguration.TF_TMP_LOC);
        String outputPath = FileOutputFormat.getOutputPath(job).toString();

        FileSystem fs = FileSystem.get(job);
        JSONObject spec = TfUtils.readSpec(fs, specFile);

        init(job.get(MRJobConfiguration.TF_HEADER), hasHeader, job.get(MRJobConfiguration.TF_DELIM),
                naStrings, spec, numCols, offsetFile, tmpPath, outputPath);
    }

    // called from GenTfMtdReducer
    public TfUtils(JobConf job, String tfMtdDir) throws IOException, JSONException {
        this(job);
        _tfMtdDir = tfMtdDir;
    }

    // called from GenTFMtdReducer and ApplyTf (Spark)
    public TfUtils(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec,
            long ncol, String tfMtdDir, String offsetFile, String tmpPath) throws IOException, JSONException {
        init(headerLine, hasHeader, delim, naStrings, spec, ncol, offsetFile, tmpPath, null);
        _tfMtdDir = tfMtdDir;
    }

    public void incrValid() { _numValidRecords++; }
    public long getValid() { return _numValidRecords; }
    public long getTotal() { return _numRecordsInPartFile; }
    public long getNumTransformedRows() { return _numTransformedRows; }
    public long getNumTransformedColumns() { return _numTransformedColumns; }

    public String getHeader() { return _headerLine; }
    public boolean hasHeader() { return _hasHeader; }
    public String getDelimString() { return _delimString; }
    public Pattern getDelim() { return _delim; }
    public String[] getNAStrings() { return _NAstrings; }
    public long getNumCols() { return _numInputCols; }

    public String getSpecFile() { return _specFile; }
    public String getTfMtdDir() { return _tfMtdDir; }
    public String getOffsetFile() { return _offsetFile; }
    public String getTmpDir() { return _tmpDir; }
    public String getOutputPath() { return _outputPath; }

    public String getName(int colID) { return _outputColumnNames[colID - 1]; }

    public void setValid(long n) { _numValidRecords = n; }
    public void incrTotal() { _numRecordsInPartFile++; }
    public void setTotal(long n) { _numRecordsInPartFile = n; }

    public OmitAgent getOmitAgent() { return _oa; }
    public MVImputeAgent getMVImputeAgent() { return _mia; }
    public RecodeAgent getRecodeAgent() { return _ra; }
    public BinAgent getBinAgent() { return _ba; }
    public DummycodeAgent getDummycodeAgent() { return _da; }

    /**
     * Function that checks if the given string is one of NA strings.
     * @param w
     * @return
     */
    public boolean isNA(String w) {
        if (_NAstrings == null)
            return false;

        for (String na : _NAstrings) {
            if (w.equals(na))
                return true;
        }
        return false;
    }

    public String[] getWords(Text line) {
        return getWords(line.toString());
    }

    public String[] getWords(String line) {
        return getDelim().split(line.trim(), -1);
    }

    /**
     * Process a given row to construct transformation metadata.
     * @param line
     * @return
     * @throws IOException
     */
    public String[] prepareTfMtd(String line) throws IOException {
        String[] words = getWords(line);
        if (!getOmitAgent().omit(words, this)) {
            getMVImputeAgent().prepare(words, this);
            getRecodeAgent().prepare(words, this);
            getBinAgent().prepare(words, this);
            incrValid();
        }
        incrTotal();
        return words;
    }

    public void loadTfMetadata() throws IOException {
        JobConf job = ConfigurationManager.getCachedJobConf();
        loadTfMetadata(job, false);
    }

    public void loadTfMetadata(JobConf job, boolean fromLocalFS) throws IOException {
        Path tfMtdDir = null;
        FileSystem fs = null;

        if (fromLocalFS) {
            // metadata must be read from local file system (e.g., distributed cache in the case of Hadoop)
            tfMtdDir = (DistributedCache.getLocalCacheFiles(job))[0];
            fs = FileSystem.getLocal(job);
        } else {
            fs = FileSystem.get(job);
            tfMtdDir = new Path(getTfMtdDir());
        }

        // load transformation metadata
        getMVImputeAgent().loadTxMtd(job, fs, tfMtdDir, this);
        getRecodeAgent().loadTxMtd(job, fs, tfMtdDir, this);
        getBinAgent().loadTxMtd(job, fs, tfMtdDir, this);

        // associate recode maps and bin definitions with dummycoding agent,
        // as recoded and binned columns are typically dummycoded
        getDummycodeAgent().setRecodeMaps(getRecodeAgent().getRecodeMaps());
        getDummycodeAgent().setNumBins(getBinAgent().getBinList(), getBinAgent().getNumBins());
        getDummycodeAgent().loadTxMtd(job, fs, tfMtdDir, this);
    }

    /*public void loadTfMetadata() throws IOException {
        Path tfMtdDir = (DistributedCache.getLocalCacheFiles(_rJob))[0];
        FileSystem localFS = FileSystem.getLocal(_rJob);
        loadTfMetadata(_rJob, localFS, tfMtdDir);

        FileSystem fs;
        fs = FileSystem.get(_rJob);
        Path thisPath = new Path(_rJob.get("map.input.file")).makeQualified(fs);
        String thisfile = thisPath.toString();
        Path smallestFilePath = new Path(_rJob.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs);
        if (thisfile.toString().equals(smallestFilePath.toString()))
            _partFileWithHeader = true;
        else
            _partFileWithHeader = false;
    }*/

    public String processHeaderLine() throws IOException {
        FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
        String dcdHeader = getDummycodeAgent().constructDummycodedHeader(getHeader(), getDelim());
        getDummycodeAgent().genDcdMapsAndColTypes(fs, getTmpDir(), (int) getNumCols(), this);

        // write header information (before and after transformation) to temporary path;
        // these files are copied into txMtdPath, once the ApplyTf job is complete.
        DataTransform.generateHeaderFiles(fs, getTmpDir(), getHeader(), dcdHeader);

        return dcdHeader;
        //_numTransformedColumns = getDelim().split(dcdHeader, -1).length;
        //return _numTransformedColumns;
    }

    public boolean omit(String[] words) {
        if (getOmitAgent() == null)
            return false;
        return getOmitAgent().omit(words, this);
    }

    public String[] apply(String[] words) {
        return apply(words, false);
    }

    /**
     * Function to apply transformation metadata on a given row.
     * @param words
     * @param optimizeMaps
     * @return
     */
    public String[] apply(String[] words, boolean optimizeMaps) {
        words = getMVImputeAgent().apply(words, this);
        if (optimizeMaps)
            // specific case of transform() invoked from CP (to save boxing and unboxing)
            words = getRecodeAgent().cp_apply(words, this);
        else
            words = getRecodeAgent().apply(words, this);
        words = getBinAgent().apply(words, this);
        words = getDummycodeAgent().apply(words, this);

        _numTransformedRows++;
        return words;
    }

    public void check(String[] words) throws DMLRuntimeException {
        boolean checkEmptyString = (getNAStrings() != null);
        if (checkEmptyString) {
            final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";
            for (int i = 0; i < words.length; i++)
                if (words[i] != null && words[i].equals(""))
                    throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(i + 1));
        }
    }

    public String checkAndPrepOutputString(String[] words) throws DMLRuntimeException {
        return checkAndPrepOutputString(words, new StringBuilder());
    }

    public String checkAndPrepOutputString(String[] words, StringBuilder sb) throws DMLRuntimeException {
        /*
         * Check if empty strings ("") have to be handled.
         *
         * Unless na.strings are provided, empty strings are (implicitly) considered as value zero.
         * When na.strings are provided, then "" is considered a missing value indicator, and the
         * user is expected to provide an appropriate imputation method. Therefore, when na.strings
         * are provided, "" encountered in any column (after all transformations are applied)
         * denotes an erroneous condition.
         */
        boolean checkEmptyString = (getNAStrings() != null); //&& !MVImputeAgent.isNA("", TransformationAgent.NAstrings) ) {

        //StringBuilder sb = new StringBuilder();
        sb.setLength(0);
        int i = 0;

        if (checkEmptyString) {
            final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";
            if (words[0] != null)
                if (words[0].equals(""))
                    throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(1));
                else
                    sb.append(words[0]);
            else
                sb.append("0");

            for (i = 1; i < words.length; i++) {
                sb.append(_delimString);

                if (words[i] != null)
                    if (words[i].equals(""))
                        throw new DMLRuntimeException(msg + getDummycodeAgent().mapDcdColumnID(i + 1));
                    else
                        sb.append(words[i]);
                else
                    sb.append("0");
            }
        } else {
            sb.append(words[0] != null ? words[0] : "0");
            for (i = 1; i < words.length; i++) {
                sb.append(_delimString);
                sb.append(words[i] != null ? words[i] : "0");
            }
        }

        return sb.toString();
    }

    private Reader initOffsetsReader(JobConf job) throws IOException {
        Path path = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
        FileSystem fs = FileSystem.get(job);
        Path[] files = MatrixReader.getSequenceFilePaths(fs, path);
        if (files.length != 1)
            throw new IOException("Expecting a single file under counters file: " + path.toString());

        Reader reader = new SequenceFile.Reader(fs, files[0], job);
        return reader;
    }

    /**
     * Function to generate custom file names (transform-part-.....) for
     * mappers' output for ApplyTfCSV job. The idea is to find the index
     * of (thisfile, fileoffset) in the list of all offsets from the
     * counters/offsets file, which was generated from either GenTfMtdMR
     * or AssignRowIDMR job.
     */
    public String getPartFileID(JobConf job, long offset) throws IOException {
        Reader reader = initOffsetsReader(job);
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        String thisFile = TfUtils.getPartFileName(job);

        int id = 0;
        while (reader.next(key, value)) {
            if (thisFile.equals(value.filename) && value.fileOffset == offset)
                break;
            id++;
        }
        reader.close();

        String sid = Integer.toString(id);
        char[] carr = new char[5 - sid.length()];
        Arrays.fill(carr, '0');
        String ret = (new String(carr)).concat(sid);

        return ret;
    }
}
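For context, here is a minimal driver-side usage sketch of the apply path. It only uses methods shown in the listing above (readSpec, prepNAStrings/parseNAStrings, the Spark-path constructor, loadTfMetadata, getWords, omit, apply, checkAndPrepOutputString); the spec file, header line, column count, and metadata/temp paths are hypothetical placeholders, and a real run assumes the transformation metadata has already been generated under the given tfMtdDir.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.apache.wink.json4j.JSONObject;

import com.ibm.bi.dml.runtime.transform.TfUtils;

public class TfUtilsExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();

        // Hypothetical spec path and NA string; a real spec is the transform JSON used by ApplyTf.
        JSONObject spec = TfUtils.readSpec(FileSystem.get(job), "/tmp/tf.spec.json");
        String[] naStrings = TfUtils.parseNAStrings(TfUtils.prepNAStrings("NA"));

        // Constructor used by GenTFMtdReducer and ApplyTf (Spark) in the listing above;
        // header, delimiter, column count, and directories are made-up examples.
        TfUtils tf = new TfUtils("id,age,zipcode", true, ",", naStrings, spec, 3,
                "/tmp/tfMtd", "/tmp/offsets", "/tmp/tfTmp");

        tf.loadTfMetadata(job, false);             // read recode maps, bin boundaries, impute values from tfMtdDir
        String[] cols = tf.getWords("7,NA,95120"); // split one CSV row on the configured delimiter
        if (!tf.omit(cols)) {                      // skip rows flagged by the omit specification
            // impute, recode, bin, and dummycode the row, then re-join it with the delimiter
            System.out.println(tf.checkAndPrepOutputString(tf.apply(cols)));
        }
    }
}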