com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java Source code

Introduction

Here is the source code for com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java.
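
This class is part of the DML (SystemML) runtime: it merges the partial result matrices written by parfor workers into a single output matrix by submitting one MapReduce job. If the original output matrix already contains non-zero values, the job runs in compare mode and ships the original file alongside the worker results, so that only cells differing from the original are taken over into the merged result.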

Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.runtime.controlprogram.parfor;

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

import com.ibm.bi.dml.api.DMLScript;
import com.ibm.bi.dml.parser.Expression.DataType;
import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.controlprogram.ParForProgramBlock;
import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject;
import com.ibm.bi.dml.runtime.controlprogram.parfor.util.StagingFileUtils;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.MatrixFormatMetaData;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixCell;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.data.TaggedMatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.TaggedMatrixCell;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;
import com.ibm.bi.dml.runtime.util.LocalFileUtils;
import com.ibm.bi.dml.runtime.util.MapReduceTool;
import com.ibm.bi.dml.utils.Statistics;

/**
 * MR job class for submitting parfor result merge jobs. The merge of all
 * partial worker results into a single output matrix is realized as one
 * MapReduce job, with an optional compare against the original output
 * (see COMPARE_TAG / DATA_TAG).
 */
public class ResultMergeRemoteMR extends ResultMerge {

    public static final byte COMPARE_TAG = 'c';
    public static final byte DATA_TAG = 'd';

    private long _pfid = -1;
    private int _numMappers = -1;
    private int _numReducers = -1;
    private int _replication = -1;
    //private int  _max_retry = -1;
    private boolean _jvmReuse = false;

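    /**
     * Creates a remote (MR) result merge instance.
     * 
     * @param out            original output matrix object (used as compare target)
     * @param in             partial result matrices produced by the parfor workers
     * @param outputFilename target file name of the merged output
     * @param pfid           ID of the parfor program block (used in the job name)
     * @param numMappers     maximum number of mappers
     * @param numReducers    maximum number of reducers
     * @param replication    replication factor for the job output
     * @param max_retry      maximum retries per map task (currently unused)
     * @param jvmReuse       enable JVM reuse, i.e., multiple tasks per JVM
     */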
    public ResultMergeRemoteMR(MatrixObject out, MatrixObject[] in, String outputFilename, long pfid,
            int numMappers, int numReducers, int replication, int max_retry, boolean jvmReuse) {
        super(out, in, outputFilename);

        _pfid = pfid;
        _numMappers = numMappers;
        _numReducers = numReducers;
        _replication = replication;
        //_max_retry = max_retry;
        _jvmReuse = jvmReuse;
    }

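    /**
     * A serial merge is never executed remotely; this gracefully degrades
     * to the parallel MR merge with the configured number of mappers.
     */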
    @Override
    public MatrixObject executeSerialMerge() throws DMLRuntimeException {
        //graceful degradation to parallel merge
        return executeParallelMerge(_numMappers);
    }

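    /**
     * Merges all partial results into the configured output file via a single
     * MR job and returns a new MatrixObject over the merged file (or the
     * original output object if there was nothing to merge).
     */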
    @Override
    public MatrixObject executeParallelMerge(int par) throws DMLRuntimeException {
        MatrixObject moNew = null; //always create new matrix object (required for nested parallelism)

        //Timing time = null;
        LOG.trace("ResultMerge (remote, mr): Execute merge for output " + _output.getVarName() + " (fname="
                + _output.getFileName() + ")");
        //   time = new Timing();
        //   time.start();

        try {
            //collect all relevant inputs
            Collection<String> srcFnames = new LinkedList<String>();
            ArrayList<MatrixObject> inMO = new ArrayList<MatrixObject>();
            for (MatrixObject in : _inputs) {
                //check for empty inputs (no iterations executed)
                if (in != null && in != _output) {
                    //ensure that input file resides on disk
                    in.exportData();

                    //add to merge list
                    srcFnames.add(in.getFileName());
                    inMO.add(in);
                }
            }

            if (!srcFnames.isEmpty()) {
                //ensure that outputfile (for comparison) resides on disk
                _output.exportData();

                //actual merge
                MatrixFormatMetaData metadata = (MatrixFormatMetaData) _output.getMetaData();
                MatrixCharacteristics mcOld = metadata.getMatrixCharacteristics();

                String fnameCompare = _output.getFileName();
                if (mcOld.getNonZeros() == 0)
                    fnameCompare = null; //no compare required

                executeMerge(fnameCompare, _outputFName, srcFnames.toArray(new String[0]), metadata.getInputInfo(),
                        metadata.getOutputInfo(), mcOld.getRows(), mcOld.getCols(), mcOld.getRowsPerBlock(),
                        mcOld.getColsPerBlock());

                //create new output matrix (e.g., to prevent potential export<->read file access conflict)
                String varName = _output.getVarName();
                ValueType vt = _output.getValueType();
                moNew = new MatrixObject(vt, _outputFName);
                moNew.setVarName(varName.contains(NAME_SUFFIX) ? varName : varName + NAME_SUFFIX);
                moNew.setDataType(DataType.MATRIX);
                OutputInfo oiOld = metadata.getOutputInfo();
                InputInfo iiOld = metadata.getInputInfo();
                MatrixCharacteristics mc = new MatrixCharacteristics(mcOld.getRows(), mcOld.getCols(),
                        mcOld.getRowsPerBlock(), mcOld.getColsPerBlock());
                mc.setNonZeros(computeNonZeros(_output, inMO));
                MatrixFormatMetaData meta = new MatrixFormatMetaData(mc, oiOld, iiOld);
                moNew.setMetaData(meta);
            } else {
                moNew = _output; //return old matrix, to prevent copy
            }
        } catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }

        //LOG.trace("ResultMerge (remote, mr): Executed merge for output "+_output.getVarName()+" (fname="+_output.getFileName()+") in "+time.stop()+"ms");

        return moNew;
    }

    /**
     * Configures and submits the result merge MR job.
     * 
     * @param fname     file name of the original output used for compare; null if no comparison required
     * @param fnameNew  target file name of the merged output
     * @param srcFnames file names of the partial results to merge
     * @param ii        input format of all source files
     * @param oi        output format of the merged file
     * @param rlen      number of rows
     * @param clen      number of columns
     * @param brlen     number of rows per block
     * @param bclen     number of columns per block
     * @throws DMLRuntimeException if configuring or running the MR job fails
     */
    @SuppressWarnings({ "unused", "deprecation" })
    protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi,
            long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
        String jobname = "ParFor-RMMR";
        long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

        JobConf job;
        job = new JobConf(ResultMergeRemoteMR.class);
        job.setJobName(jobname + _pfid);

        //maintain dml script counters
        Statistics.incrementNoOfCompiledMRJobs();

        //warning for textcell/binarycell without compare
        boolean withCompare = (fname != null);
        if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare
                && ResultMergeLocalFile.ALLOW_COPY_CELLFILES)
            LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi)
                    + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR.");

        try {
            Path pathCompare = null;
            Path pathNew = new Path(fnameNew);

            /////
            //configure the MR job
            if (withCompare) {
                pathCompare = new Path(fname).makeQualified(FileSystem.get(job));
                MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii,
                        LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen,
                        bclen);
            } else
                MRJobConfiguration.setResultMergeInfo(job, "null", ii,
                        LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen,
                        bclen);

            //set mappers, reducers, combiners
            job.setMapperClass(ResultMergeRemoteMapper.class);
            job.setReducerClass(ResultMergeRemoteReducer.class);

            if (oi == OutputInfo.TextCellOutputInfo) {
                job.setMapOutputKeyClass(MatrixIndexes.class);
                job.setMapOutputValueClass(TaggedMatrixCell.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(Text.class);
            } else if (oi == OutputInfo.BinaryCellOutputInfo) {
                job.setMapOutputKeyClass(MatrixIndexes.class);
                job.setMapOutputValueClass(TaggedMatrixCell.class);
                job.setOutputKeyClass(MatrixIndexes.class);
                job.setOutputValueClass(MatrixCell.class);
            } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
                //setup partitioning, grouping, sorting for composite key (old API)
                job.setPartitionerClass(ResultMergeRemotePartitioning.class); //partitioning
                job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class); //grouping
                job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class); //sorting

                job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class);
                job.setMapOutputValueClass(TaggedMatrixBlock.class);
                job.setOutputKeyClass(MatrixIndexes.class);
                job.setOutputValueClass(MatrixBlock.class);
            }

            //set input format 
            job.setInputFormat(ii.inputFormatClass);

            //set the input path 
            Path[] paths = null;
            if (withCompare) {
                paths = new Path[srcFnames.length + 1];
                paths[0] = pathCompare;
                for (int i = 1; i < paths.length; i++)
                    paths[i] = new Path(srcFnames[i - 1]);
            } else {
                paths = new Path[srcFnames.length];
                for (int i = 0; i < paths.length; i++)
                    paths[i] = new Path(srcFnames[i]);
            }
            FileInputFormat.setInputPaths(job, paths);

            //set output format
            job.setOutputFormat(oi.outputFormatClass);

            //set output path
            MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
            FileOutputFormat.setOutputPath(job, pathNew);

            //////
            //set optimization parameters

            //set the number of mappers and reducers 
            //job.setNumMapTasks( _numMappers ); //use default num mappers
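            //cap the number of reducers by the estimated number of reduce groups
            //(block indexes for binary block, cell-buffer partitions for cell formats)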
            long reducerGroups = _numReducers;
            if (oi == OutputInfo.BinaryBlockOutputInfo)
                reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1);
            else //textcell/binarycell
                reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1);
            job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

            //use FLEX scheduler configuration properties
            if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
                job.setInt("flex.map.min", 0);
                job.setInt("flex.map.max", _numMappers);
                job.setInt("flex.reduce.min", 0);
                job.setInt("flex.reduce.max", _numMappers);
            }

            //disable automatic tasks timeouts and speculative task exec
            job.setInt("mapred.task.timeout", 0);
            job.setMapSpeculativeExecution(false);

            //set up preferred custom serialization framework for binary block format
            if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
                MRJobConfiguration.addBinaryBlockSerializationFramework(job);

            //enables the reuse of JVMs (multiple tasks per MR task)
            if (_jvmReuse)
                job.setNumTasksToExecutePerJvm(-1); //unlimited

            //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
            //job.set("mapred.compress.map.output", "true");
            //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

            //set the replication factor for the results
            job.setInt("dfs.replication", _replication);

            //set the max number of retries per map task
            //  disabled job-level configuration to respect cluster configuration
            //  note: this refers to hadoop2, hence it never had effect on mr1
            //job.setInt("mapreduce.map.maxattempts", _max_retry);

            //set unique working dir
            MRJobConfiguration.setUniqueWorkingDir(job);

            /////
            // execute the MR job   

            JobClient.runJob(job);

            //maintain dml script counters
            Statistics.incrementNoOfExecutedMRJobs();
        } catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }

        if (DMLScript.STATISTICS) {
            long t1 = System.nanoTime();
            Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
        }
    }

}
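
Usage

For context, here is a minimal sketch of how a caller might drive this class. All file names, dimensions, and settings below are hypothetical; in SystemML the parfor runtime constructs these MatrixObject instances (and the underlying HDFS files) itself, so this sketch only illustrates the API surface, not a verified end-to-end run.

import com.ibm.bi.dml.parser.Expression.DataType;
import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject;
import com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMR;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.MatrixFormatMetaData;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;

public class ResultMergeExample {

    //helper that mirrors the metadata setup seen in executeParallelMerge above
    private static MatrixObject createMatrixObject(String varName, String fname,
            long rows, long cols, int brlen, int bclen) {
        MatrixCharacteristics mc = new MatrixCharacteristics(rows, cols, brlen, bclen);
        MatrixFormatMetaData meta = new MatrixFormatMetaData(mc,
                OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
        MatrixObject mo = new MatrixObject(ValueType.DOUBLE, fname);
        mo.setVarName(varName);
        mo.setDataType(DataType.MATRIX);
        mo.setMetaData(meta);
        return mo;
    }

    public static void main(String[] args) throws Exception {
        //hypothetical 10k x 10k matrix in 1k x 1k binary blocks
        MatrixObject out = createMatrixObject("R", "hdfs:/tmp/parfor/R", 10000, 10000, 1000, 1000);
        MatrixObject[] partials = new MatrixObject[] {
                createMatrixObject("R", "hdfs:/tmp/parfor/R_w1", 10000, 10000, 1000, 1000),
                createMatrixObject("R", "hdfs:/tmp/parfor/R_w2", 10000, 10000, 1000, 1000) };

        //merge the two partial results into a new file via one MR job
        ResultMergeRemoteMR rm = new ResultMergeRemoteMR(out, partials,
                "hdfs:/tmp/parfor/R_merged",
                1L,    //parfor ID, used to label the MR job
                10,    //max mappers
                10,    //max reducers
                1,     //replication factor
                3,     //max retries per map task (currently ignored)
                true); //enable JVM reuse
        MatrixObject merged = rm.executeParallelMerge(10);
        System.out.println("merged file: " + merged.getFileName());
    }
}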