com.ibm.bi.dml.runtime.matrix.mapred.MapperBase.java Source code

Java tutorial

Introduction

Here is the source code for com.ibm.bi.dml.runtime.matrix.mapred.MapperBase.java

Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.runtime.matrix.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.instructions.mr.AggregateBinaryInstruction;
import com.ibm.bi.dml.runtime.instructions.mr.CSVReblockInstruction;
import com.ibm.bi.dml.runtime.instructions.mr.DataGenMRInstruction;
import com.ibm.bi.dml.runtime.instructions.mr.MRInstruction;
import com.ibm.bi.dml.runtime.instructions.mr.PMMJMRInstruction;
import com.ibm.bi.dml.runtime.instructions.mr.ReblockInstruction;
import com.ibm.bi.dml.runtime.matrix.data.Converter;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.data.MatrixValue;
import com.ibm.bi.dml.runtime.matrix.data.Pair;
import com.ibm.bi.dml.runtime.matrix.data.TaggedMatrixValue;

@SuppressWarnings("rawtypes")
public abstract class MapperBase extends MRBaseForCommonInstructions {

    protected static final Log LOG = LogFactory.getLog(MapperBase.class);

    //the indexes that this particular input matrix file represents
    protected ArrayList<Byte> representativeMatrixes = null;

    //the dimension for all the representative matrices 
    //(they are all the same, since coming from the same files)
    protected long[] rlens = null;
    protected long[] clens = null;

    //the block sizes for the representative matrices
    protected int[] brlens = null;
    protected int[] bclens = null;

    //upper boundaries to check
    protected long[] rbounds = null;
    protected long[] cbounds = null;

    //boundary block sizes
    protected int[] lastblockrlens = null;
    protected int[] lastblockclens = null;

    //rand instructions that need to be performed in mapper
    protected ArrayList<DataGenMRInstruction> dataGen_instructions = new ArrayList<DataGenMRInstruction>();

    //instructions that need to be performed in mapper
    protected ArrayList<ArrayList<MRInstruction>> mapper_instructions = new ArrayList<ArrayList<MRInstruction>>();

    //block instructions that need to be performed in part by mapper
    protected ArrayList<ArrayList<ReblockInstruction>> reblock_instructions = new ArrayList<ArrayList<ReblockInstruction>>();

    //csv block instructions that need to be performed in part by mapper
    protected ArrayList<ArrayList<CSVReblockInstruction>> csv_reblock_instructions = new ArrayList<ArrayList<CSVReblockInstruction>>();

    //the indexes of the matrices that needed to be outputted
    protected ArrayList<ArrayList<Byte>> outputIndexes = new ArrayList<ArrayList<Byte>>();

    //converter to convert the input record into indexes and matrix value (can be a cell or a block)
    protected Converter inputConverter = null;

    //a counter to measure the time spent in a mapper
    protected static enum Counters {
        MAP_TIME
    };

    @SuppressWarnings("unchecked")
    protected void commonMap(Writable rawKey, Writable rawValue, OutputCollector<Writable, Writable> out,
            Reporter reporter) throws IOException {
        long start = System.currentTimeMillis();

        //System.out.println("read in Mapper: "+rawKey+": "+rawValue);

        //for each representative matrix, read the record and apply instructions
        for (int i = 0; i < representativeMatrixes.size(); i++) {
            byte thisMatrix = representativeMatrixes.get(i);

            //convert the record into the right format for the representative matrix
            inputConverter.setBlockSize(brlens[i], bclens[i]);
            inputConverter.convert(rawKey, rawValue);

            //apply unary instructions on the converted indexes and values
            while (inputConverter.hasNext()) {
                Pair<MatrixIndexes, MatrixValue> pair = inputConverter.next();

                MatrixIndexes indexes = pair.getKey();
                MatrixValue value = pair.getValue();

                checkValidity(indexes, value, i);

                //put the input in the cache
                cachedValues.reset();
                cachedValues.set(thisMatrix, indexes, value);

                //special operations for individual mapp type
                specialOperationsForActualMap(i, out, reporter);
            }
        }

        reporter.incrCounter(Counters.MAP_TIME, System.currentTimeMillis() - start);
    }

    protected abstract void specialOperationsForActualMap(int index, OutputCollector<Writable, Writable> out,
            Reporter reporter) throws IOException;

    protected void checkValidity(MatrixIndexes indexes, MatrixValue value, int rep) throws IOException {
        if (indexes.getRowIndex() <= 0 || indexes.getColumnIndex() <= 0 || indexes.getRowIndex() > rbounds[rep]
                || indexes.getColumnIndex() > cbounds[rep]) {

            throw new IOException("key: " + indexes + " is out of range: [1, " + rbounds[rep] + "] and [1, "
                    + cbounds[rep] + "] (tag=" + rep + ")!");
        }

        if (indexes.getRowIndex() == rbounds[rep] && value.getNumRows() > lastblockrlens[rep]) {
            throw new IOException("boundary block with " + value.getNumRows() + " rows exceeds the size "
                    + lastblockrlens[rep] + " " + "(tag=" + rep + ", ix=" + indexes + ", " + value.getNumRows()
                    + "x" + value.getNumColumns() + ")");
        }

        if (indexes.getColumnIndex() == cbounds[rep] && value.getNumColumns() > lastblockclens[rep]) {
            throw new IOException("boundary block with " + value.getNumColumns() + " columns exceeds the size "
                    + lastblockclens[rep] + " " + "(tag=" + rep + ", ix=" + indexes + ", " + value.getNumRows()
                    + "x" + value.getNumColumns() + ")");
        }
    }

    /**
     * Determines if empty blocks can be discarded on map input. Conceptually, this is true
     * if the individual instruction don't need to output empty blocks and if they are sparsesafe.
     * 
     * @return
     */
    public boolean allowsFilterEmptyInputBlocks() {
        boolean ret = true;
        int count = 0;

        if (ret && mapper_instructions != null)
            for (ArrayList<MRInstruction> vinst : mapper_instructions)
                for (MRInstruction inst : vinst) {
                    ret &= (inst instanceof AggregateBinaryInstruction
                            && !((AggregateBinaryInstruction) inst).getOutputEmptyBlocks())
                            || (inst instanceof PMMJMRInstruction
                                    && !((PMMJMRInstruction) inst).getOutputEmptyBlocks());
                    count++; //ensure that mapper instructions exists
                }

        return ret && count > 0;
    }

    public void configure(JobConf job) {
        super.configure(job);

        //get the indexes that this matrix file represents, 
        //since one matrix file can occur multiple times in a statement
        try {
            representativeMatrixes = MRJobConfiguration.getInputMatrixIndexesInMapper(job);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        //get input converter information
        inputConverter = MRJobConfiguration.getInputConverter(job, representativeMatrixes.get(0));

        DataGenMRInstruction[] allDataGenIns;
        MRInstruction[] allMapperIns;
        ReblockInstruction[] allReblockIns;
        CSVReblockInstruction[] allCSVReblockIns;

        try {
            allDataGenIns = MRJobConfiguration.getDataGenInstructions(job);

            //parse the instructions on the matrices that this file represent
            allMapperIns = MRJobConfiguration.getInstructionsInMapper(job);

            //parse the reblock instructions on the matrices that this file represent
            allReblockIns = MRJobConfiguration.getReblockInstructions(job);

            allCSVReblockIns = MRJobConfiguration.getCSVReblockInstructions(job);

        } catch (DMLUnsupportedOperationException e) {
            throw new RuntimeException(e);
        } catch (DMLRuntimeException e) {
            throw new RuntimeException(e);
        }

        //get all the output indexes
        byte[] outputs = MRJobConfiguration.getOutputIndexesInMapper(job);

        //get the dimension of all the representative matrices
        rlens = new long[representativeMatrixes.size()];
        clens = new long[representativeMatrixes.size()];
        for (int i = 0; i < representativeMatrixes.size(); i++) {
            rlens[i] = MRJobConfiguration.getNumRows(job, representativeMatrixes.get(i));
            clens[i] = MRJobConfiguration.getNumColumns(job, representativeMatrixes.get(i));
            //   System.out.println("get dimension for "+representativeMatrixes.get(i)+": "+rlens[i]+", "+clens[i]);
        }

        //get the block sizes of the representative matrices
        brlens = new int[representativeMatrixes.size()];
        bclens = new int[representativeMatrixes.size()];

        for (int i = 0; i < representativeMatrixes.size(); i++) {
            brlens[i] = MRJobConfiguration.getNumRowsPerBlock(job, representativeMatrixes.get(i));
            bclens[i] = MRJobConfiguration.getNumColumnsPerBlock(job, representativeMatrixes.get(i));
            //   System.out.println("get blocksize for "+representativeMatrixes.get(i)+": "+brlens[i]+", "+bclens[i]);
        }

        rbounds = new long[representativeMatrixes.size()];
        cbounds = new long[representativeMatrixes.size()];

        lastblockrlens = new int[representativeMatrixes.size()];
        lastblockclens = new int[representativeMatrixes.size()];
        //calculate upper boundaries for key value pairs
        if (valueClass.equals(MatrixBlock.class)) {
            for (int i = 0; i < representativeMatrixes.size(); i++) {
                rbounds[i] = (long) Math.ceil((double) rlens[i] / (double) brlens[i]);
                cbounds[i] = (long) Math.ceil((double) clens[i] / (double) bclens[i]);

                lastblockrlens[i] = (int) (rlens[i] % brlens[i]);
                lastblockclens[i] = (int) (clens[i] % bclens[i]);
                if (lastblockrlens[i] == 0)
                    lastblockrlens[i] = brlens[i];
                if (lastblockclens[i] == 0)
                    lastblockclens[i] = bclens[i];

                /*
                 * what is this for????
                // DRB: the row indexes need to be fixed 
                rbounds[i] = rlens[i];*/
            }
        } else {
            for (int i = 0; i < representativeMatrixes.size(); i++) {
                rbounds[i] = rlens[i];
                cbounds[i] = clens[i];
                lastblockrlens[i] = 1;
                lastblockclens[i] = 1;
                //   System.out.println("get bound for "+representativeMatrixes.get(i)+": "+rbounds[i]+", "+cbounds[i]);
            }
        }

        //load data from distributed cache (if required, reuse if jvm_reuse)
        try {
            setupDistCacheFiles(job);
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }

        //collect unary instructions for each representative matrix
        HashSet<Byte> set = new HashSet<Byte>();
        for (int i = 0; i < representativeMatrixes.size(); i++) {
            set.clear();
            set.add(representativeMatrixes.get(i));

            //collect the relavent datagen instructions for this representative matrix
            ArrayList<DataGenMRInstruction> dataGensForThisMatrix = new ArrayList<DataGenMRInstruction>();
            if (allDataGenIns != null) {
                for (DataGenMRInstruction ins : allDataGenIns) {
                    if (set.contains(ins.getInput())) {
                        dataGensForThisMatrix.add(ins);
                        set.add(ins.output);
                    }
                }
            }
            if (dataGensForThisMatrix.size() > 1)
                throw new RuntimeException("only expects at most one rand instruction per input");
            if (dataGensForThisMatrix.isEmpty())
                dataGen_instructions.add(null);
            else
                dataGen_instructions.add(dataGensForThisMatrix.get(0));

            //collect the relavent instructions for this representative matrix
            ArrayList<MRInstruction> opsForThisMatrix = new ArrayList<MRInstruction>();

            if (allMapperIns != null) {
                for (MRInstruction ins : allMapperIns) {
                    try {
                        /*
                        boolean toAdd=true;
                        for(byte input: ins.getInputIndexes())
                           if(!set.contains(input))
                           {
                              toAdd=false;
                              break;
                           }
                           */
                        boolean toAdd = false;
                        for (byte input : ins.getInputIndexes())
                            if (set.contains(input)) {
                                toAdd = true;
                                break;
                            }

                        if (toAdd) {
                            opsForThisMatrix.add(ins);
                            set.add(ins.output);
                        }
                    } catch (DMLRuntimeException e) {
                        throw new RuntimeException(e);
                    }
                }
            }

            mapper_instructions.add(opsForThisMatrix);

            //collect the relavent reblock instructions for this representative matrix
            ArrayList<ReblockInstruction> reblocksForThisMatrix = new ArrayList<ReblockInstruction>();
            if (allReblockIns != null) {
                for (ReblockInstruction ins : allReblockIns) {
                    if (set.contains(ins.input)) {
                        reblocksForThisMatrix.add(ins);
                        set.add(ins.output);
                    }
                }
            }

            reblock_instructions.add(reblocksForThisMatrix);

            //collect the relavent reblock instructions for this representative matrix
            ArrayList<CSVReblockInstruction> csvReblocksForThisMatrix = new ArrayList<CSVReblockInstruction>();
            if (allCSVReblockIns != null) {
                for (CSVReblockInstruction ins : allCSVReblockIns) {
                    if (set.contains(ins.input)) {
                        csvReblocksForThisMatrix.add(ins);
                        set.add(ins.output);
                    }
                }
            }

            csv_reblock_instructions.add(csvReblocksForThisMatrix);

            //collect the output indexes for this representative matrix
            ArrayList<Byte> outsForThisMatrix = new ArrayList<Byte>();
            for (byte output : outputs) {
                if (set.contains(output))
                    outsForThisMatrix.add(output);
            }
            outputIndexes.add(outsForThisMatrix);
        }
    }

    protected void processMapperInstructionsForMatrix(int index) throws IOException {
        //apply all mapper instructions
        try {
            processMixedInstructions(mapper_instructions.get(index));
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    protected void processMapOutputToReducer(int index, MatrixIndexes indexBuffer,
            TaggedMatrixValue taggedValueBuffer, OutputCollector<Writable, Writable> out) throws IOException {

        for (byte output : outputIndexes.get(index)) {
            ArrayList<IndexedMatrixValue> results = cachedValues.get(output);
            if (results == null)
                continue;
            for (IndexedMatrixValue result : results) {
                if (result == null)
                    continue;
                indexBuffer.setIndexes(result.getIndexes());
                ////////////////////////////////////////
                //   taggedValueBuffer.getBaseObject().copy(result.getValue());
                taggedValueBuffer.setBaseObject(result.getValue());
                ////////////////////////////////////////
                taggedValueBuffer.setTag(output);
                out.collect(indexBuffer, taggedValueBuffer);
                //   System.out.println("map output: "+indexBuffer+"\n"+taggedValueBuffer);
            }

        }
    }
}