// org.apache.sysml.runtime.matrix.data.LibMatrixDNN.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.sysml.runtime.matrix.data;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;

/**
 * This class allows users to invoke deep learning related operations 
 * (such as conv2d, conv2d_backward_data, conv2d_backward_filter, maxpooling, maxpooling_backward, bias_add)
 * using multiple threads.
 * 
 * The methods accept the input matrices as MatrixBlock and the parameters using ConvolutionParameters.
 * 
 * To run in single thread, please set ConvolutionParameters.numThreads to 1.
 */
public class LibMatrixDNN {

    protected static final Log LOG = LogFactory.getLog(LibMatrixDNN.class.getName());
    // ------------------------------------------------------------------------------------------------
    // Useful flags for performance testing:
    // DISPLAY_STATISTICS (together with DMLScript.STATISTICS) gates the counters/timers
    // below; see appendStatistics() / resetStatistics().
    private static boolean DISPLAY_STATISTICS = false;
    // NOTE(review): ALLOW_MULTI_THREADED_OPS is not referenced in this chunk; presumably
    // consulted by runConvTask (defined elsewhere in this class) to enable multi-threading — verify.
    private static final boolean ALLOW_MULTI_THREADED_OPS = true;
    // ------------------------------------------------------------------------------------------------

    // Identifies which operator a worker task executes; passed to runConvTask by the public entry points.
    enum TaskType {
        MaxPooling_Forward, MaxPooling_Backward,
        // Alternate approaches (direct convolution, non-looped im2col) were tried, but their
        // performance was unsatisfactory, so they are not included.
        LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData, BiasAdd, ReluBackward
    }

    // ------------------------------------------------------------------------------------------------
    // Invocation counters split by sparse vs. dense inputs (incremented in the public entry
    // points), and cumulative phase timers in NANOSECONDS (accumulated from System.nanoTime()
    // deltas in the doLooped* methods; converted to seconds only for display).
    private static AtomicLong conv2dSparseCount = new AtomicLong(0);
    private static AtomicLong conv2dDenseCount = new AtomicLong(0);
    private static AtomicLong conv2dBwdFilterSparseCount = new AtomicLong(0);
    private static AtomicLong conv2dBwdFilterDenseCount = new AtomicLong(0);
    private static AtomicLong conv2dBwdDataSparseCount = new AtomicLong(0);
    private static AtomicLong conv2dBwdDataDenseCount = new AtomicLong(0);
    private static AtomicLong im2colSparseCount = new AtomicLong(0);
    private static AtomicLong im2colDenseCount = new AtomicLong(0);
    private static AtomicLong maxPoolBwdSparseCount = new AtomicLong(0);
    private static AtomicLong maxPoolBwdDenseCount = new AtomicLong(0);
    private static AtomicLong loopedConvMatMultTime = new AtomicLong(0);
    private static AtomicLong loopedConvIm2ColTime = new AtomicLong(0);
    private static AtomicLong loopedConvBwdFilterMatMultTime = new AtomicLong(0);
    private static AtomicLong loopedConvBwdFilterIm2ColTime = new AtomicLong(0);
    private static AtomicLong loopedConvBwdDataMatMultTime = new AtomicLong(0);
    private static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0);

    /**
     * Appends the accumulated LibMatrixDNN counters and timers to the given StringBuilder
     * in a human-readable form. Emits output only when statistics collection is enabled
     * (DMLScript.STATISTICS and DISPLAY_STATISTICS) and at least one conv2d invocation
     * has been recorded.
     *
     * @param sb destination for the statistics lines
     */
    public static void appendStatistics(StringBuilder sb) {
        if (DMLScript.STATISTICS && DISPLAY_STATISTICS
                && (conv2dDenseCount.get() != 0 || conv2dSparseCount.get() != 0)) {
            sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t" + conv2dDenseCount.get() + "/"
                    + conv2dBwdFilterDenseCount.get() + "/" + conv2dBwdDataDenseCount.get() + "/"
                    + im2colDenseCount.get() + "/" + maxPoolBwdDenseCount.get() + ".\n");
            sb.append("LibMatrixDNN sparse count (conv/bwdF/bwdD/im2col/maxBwd):\t" + conv2dSparseCount.get() + "/"
                    + conv2dBwdFilterSparseCount.get() + "/" + conv2dBwdDataSparseCount.get() + "/"
                    + im2colSparseCount.get() + "/" + maxPoolBwdSparseCount.get() + ".\n");
            if (loopedConvMatMultTime.get() != 0 || loopedConvIm2ColTime.get() != 0) {
                // Timers are accumulated in nanoseconds; * 1e-9 converts to seconds for display.
                sb.append("LibMatrixDNN conv(im2col/matmult), bwdF (im2col/matmult), bwdD (col2im/matmult) time:\t"
                        + String.format("%.3f", loopedConvIm2ColTime.get() * 1e-9) + "/"
                        + String.format("%.3f", loopedConvMatMultTime.get() * 1e-9) + "/"
                        + String.format("%.3f", loopedConvBwdFilterIm2ColTime.get() * 1e-9) + "/"
                        + String.format("%.3f", loopedConvBwdFilterMatMultTime.get() * 1e-9) + "/"
                        + String.format("%.3f", loopedConvBwdDataCol2ImTime.get() * 1e-9) + "/"
                        + String.format("%.3f", loopedConvBwdDataMatMultTime.get() * 1e-9) + " sec.\n");
            }
        }
    }

    /**
     * Resets every performance counter and phase timer maintained by this class to zero.
     */
    public static void resetStatistics() {
        AtomicLong[] stats = new AtomicLong[] {
            conv2dDenseCount, conv2dBwdFilterDenseCount, conv2dBwdDataDenseCount,
            im2colDenseCount, maxPoolBwdDenseCount,
            conv2dSparseCount, conv2dBwdFilterSparseCount, conv2dBwdDataSparseCount,
            im2colSparseCount, maxPoolBwdSparseCount,
            loopedConvIm2ColTime, loopedConvMatMultTime, loopedConvBwdFilterMatMultTime,
            loopedConvBwdFilterIm2ColTime, loopedConvBwdDataMatMultTime, loopedConvBwdDataCol2ImTime
        };
        for (AtomicLong stat : stats)
            stat.set(0);
    }
    // ------------------------------------------------------------------------------------------------

    /**
     * This method computes the backpropagation errors for the previous layer of the convolution operation
     * 
     * @param filter filter used in conv2d, of shape [K x CRS]
     * @param dout errors from next layer, of shape [N x KPQ]
     * @param outputBlock  output errors
     * @param params convolution parameters
     * @throws DMLRuntimeException if the inputs are inconsistent with params or strides are non-positive
     */
    public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock,
            ConvolutionParameters params) throws DMLRuntimeException {
        params.input1 = filter;
        params.input2 = dout;
        params.output = outputBlock;
        if (filter.getNumRows() != params.K || filter.getNumColumns() != params.C * params.R * params.S
                || dout.getNumRows() != params.N || dout.getNumColumns() != params.K * params.P * params.Q) {
            // Fixed: the message previously said "conv2d_backward_filter" (copy-paste error)
            throw new DMLRuntimeException("Incorrect input to conv2d_backward_data");
        }
        if (params.stride_h <= 0 || params.stride_w <= 0) {
            throw new DMLRuntimeException("Only positive strides supported");
        }

        // Track sparse vs. dense invocations for the performance statistics
        if (DMLScript.STATISTICS && DISPLAY_STATISTICS) {
            if (filter.isInSparseFormat() || dout.isInSparseFormat()) {
                conv2dBwdDataSparseCount.addAndGet(1);
            } else {
                conv2dBwdDataDenseCount.addAndGet(1);
            }
        }

        runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params);
    }

    /**
     * This method computes the backpropagation errors for the filter of the convolution operation
     * 
     * @param input input image, of shape [N x CHW]
     * @param dout errors from next layer, of shape [N x KPQ]
     * @param outputBlock  output errors
     * @param params convolution parameters
     * @throws DMLRuntimeException if the inputs are inconsistent with params or strides are non-positive
     */
    public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock,
            ConvolutionParameters params) throws DMLRuntimeException {
        params.input1 = input;
        params.input2 = dout;
        params.output = outputBlock;

        // Validate input [N x CHW] and dout [N x KPQ] against the parameters
        boolean badInput = input.getNumRows() != params.N
                || input.getNumColumns() != params.C * params.H * params.W;
        boolean badDout = dout.getNumRows() != params.N
                || dout.getNumColumns() != params.K * params.P * params.Q;
        if (badInput || badDout) {
            throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
        }
        if (params.stride_h <= 0 || params.stride_w <= 0) {
            throw new DMLRuntimeException("Only positive strides supported");
        }

        // Track sparse vs. dense invocations for the performance statistics
        if (DMLScript.STATISTICS && DISPLAY_STATISTICS) {
            boolean anySparse = input.isInSparseFormat() || dout.isInSparseFormat();
            (anySparse ? conv2dBwdFilterSparseCount : conv2dBwdFilterDenseCount).addAndGet(1);
        }

        runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
    }

    /**
     * Performs the operation: ret += elem (element-wise, in place).
     * The result matrix ret must be in dense format; elem may be dense or sparse.
     *
     * @param ret left and output matrix (dense)
     * @param elem right matrix
     * @throws DMLRuntimeException if dimensions mismatch or ret is in sparse format
     */
    private static void elementWiseInPlaceAddition(MatrixBlock ret, MatrixBlock elem) throws DMLRuntimeException {
        if (ret.getNumRows() != elem.getNumRows() || ret.getNumColumns() != elem.getNumColumns()) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }
        if (ret.isInSparseFormat()) {
            throw new DMLRuntimeException("Sparse return format not supported");
        }
        if (!elem.isInSparseFormat()) {
            // dense += dense: flat pass over the shared linearized layout
            final int len = ret.getNumRows() * ret.getNumColumns();
            for (int i = 0; i < len; i++) {
                ret.denseBlock[i] += elem.denseBlock[i];
            }
        } else if (!elem.isEmptyBlock()) {
            // dense += sparse: scatter each stored cell into the dense block
            final int nCols = ret.getNumColumns();
            Iterator<IJV> it = elem.sparseBlock.getIterator();
            while (it.hasNext()) {
                IJV cell = it.next();
                ret.denseBlock[cell.getI() * nCols + cell.getJ()] += cell.getV();
            }
        }
    }

    /**
     * Performs the operation: ret += t(elem) (in place).
     * The result matrix ret must be in dense format and have the transposed shape of elem.
     *
     * @param ret left and output matrix (dense, shape [elem.cols x elem.rows])
     * @param elem right untransposed matrix
     * @throws DMLRuntimeException if dimensions mismatch or ret is in sparse format
     */
    private static void elementWiseInPlaceTransposedAddition(MatrixBlock ret, MatrixBlock elem)
            throws DMLRuntimeException {
        if (ret.getNumRows() != elem.getNumColumns() || ret.getNumColumns() != elem.getNumRows()) {
            throw new DMLRuntimeException("Incorrect dimensions");
        }
        // NOTE: despite its name, numRow holds ret's number of COLUMNS; it serves as the
        // row stride when addressing ret.denseBlock below (ret[j][i] = j*numRow + i).
        int numRow = ret.getNumColumns();
        if (!ret.isInSparseFormat() && !elem.isInSparseFormat()) {
            // dense += t(dense): iter walks elem row-major while (j, i) addresses the transpose in ret
            int iter = 0;
            for (int i = 0; i < elem.getNumRows(); i++) {
                for (int j = 0; j < elem.getNumColumns(); j++, iter++) {
                    int index = j * numRow + i;
                    ret.denseBlock[index] += elem.denseBlock[iter];
                }
            }
        } else if (!ret.isInSparseFormat() && elem.isInSparseFormat()) {
            // dense += t(sparse): scatter each stored cell (i, j) of elem into ret at (j, i)
            if (!elem.isEmptyBlock()) {
                Iterator<IJV> iter = elem.sparseBlock.getIterator();
                while (iter.hasNext()) {
                    IJV ijv = iter.next();
                    int index = ijv.getJ() * numRow + ijv.getI();
                    ret.denseBlock[index] += ijv.getV();
                }
            }
        } else {
            throw new DMLRuntimeException("Sparse return format not supported");
        }
    }

    /**
     * Conv2d backward-data for a single image n: rotates/reshapes row n of dout into
     * dout_reshaped, multiplies it with the filter into a [PQ x CRS] product, and
     * scatters that product back into the output image via col2im.
     *
     * @param n index of the image (row of dout) to process
     * @param dout_reshaped pre-allocated dense scratch block, overwritten by doRotate180
     * @param params convolution parameters (input1 = filter, input2 = dout)
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    private static void doLoopedIm2ColConv2dBwdData(int n, MatrixBlock dout_reshaped, ConvolutionParameters params)
            throws DMLRuntimeException {
        MatrixBlock filter = params.input1;
        MatrixBlock dout = params.input2;
        doRotate180(n, 0, dout, dout_reshaped.denseBlock, params, true);
        // doRotate180 wrote directly into the dense block, so the nnz count must be refreshed
        dout_reshaped.recomputeNonZeros();

        MatrixBlock temp = new MatrixBlock(params.P * params.Q, params.C * params.R * params.S, false);
        // t1..t3 bracket the matmult and col2im phases for the statistics timers (nanoseconds)
        long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
        LibMatrixMult.matrixMult(dout_reshaped, filter, temp, false);
        long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
        doCol2imOverSingleImage(n, temp, params);
        long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
        if (DMLScript.STATISTICS && DISPLAY_STATISTICS) {
            loopedConvBwdDataMatMultTime.addAndGet(t2 - t1);
            loopedConvBwdDataCol2ImTime.addAndGet(t3 - t2);
        }
    }

    /**
     * Conv2d backward-filter for a single image n: computes im2col of image n, rotates/
     * reshapes dout(n) into dout_reshaped, multiplies the two into a [CRS x K] partial
     * gradient, and accumulates its transpose into partialRetBlock.
     *
     * @param n index of the image to process
     * @param im2ColOutBlock pre-allocated im2col scratch block, overwritten here
     * @param dout_reshaped pre-allocated scratch block, overwritten by doRotate180
     * @param partialRetBlock partial filter gradient, accumulated in place (shape t([CRS x K]))
     * @param params convolution parameters (input1 = input image, input2 = dout)
     * @return partialRetBlock (the same object, for caller convenience)
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    private static MatrixBlock doLoopedIm2ColConv2dBwdFilter(int n, MatrixBlock im2ColOutBlock,
            MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params)
            throws DMLRuntimeException {
        // t1/t2 bracket the im2col phase; t3/t4 bracket the matmult phase (statistics timers)
        long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
        doIm2col(n, im2ColOutBlock, params);
        im2ColOutBlock.recomputeNonZeros();
        long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;

        doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, params, true);
        dout_reshaped.recomputeNonZeros();

        MatrixBlock temp = new MatrixBlock(params.C * params.R * params.S, params.K, false);
        long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
        LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp, false);
        long t4 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
        if (DMLScript.STATISTICS && DISPLAY_STATISTICS) {
            loopedConvBwdFilterMatMultTime.addAndGet(t4 - t3);
            loopedConvBwdFilterIm2ColTime.addAndGet(t2 - t1);
        }
        // Accumulate t(temp) into the partial result; skipped when temp is all zeros
        if (!temp.isEmptyBlock())
            elementWiseInPlaceTransposedAddition(partialRetBlock, temp);
        return partialRetBlock;
    }

    /**
     * Decomposes a linearized column index j of a [C x H x W] tensor into its
     * (channel, row, column) coordinates, written into ret[0..2].
     *
     * @param j linearized index (non-negative)
     * @param ret output array of length at least 3: {channel, h, w}
     * @param H tensor height
     * @param W tensor width
     * @throws DMLRuntimeException declared for interface compatibility; never thrown here
     */
    private static void computeTensorIndexes(int j, int[] ret, int H, int W) throws DMLRuntimeException {
        final int planeSize = H * W;
        ret[0] = j / planeSize;
        final int rem = j % planeSize;
        ret[1] = rem / W;
        ret[2] = rem % W;
    }

    /**
     * Performs a conv2d operation via looped im2col + matrix multiply.
     * Expects input as [N x CHW] and filter as [K x CRS]; the result is written to outputBlock.
     *
     * @param input input image matrix [N x CHW]
     * @param filter filter matrix [K x CRS]
     * @param outputBlock output matrix
     * @param params convolution parameters
     * @throws DMLRuntimeException if the input dimensions are inconsistent with params
     */
    public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock,
            ConvolutionParameters params) throws DMLRuntimeException {
        params.input1 = input;
        params.input2 = filter;
        params.output = outputBlock;

        if (input.getNumRows() != params.N || input.getNumColumns() != params.C * params.H * params.W
                || filter.getNumRows() != params.K || filter.getNumColumns() != params.C * params.R * params.S) {
            throw new DMLRuntimeException("Incorrect input to conv2d");
        }

        // Track sparse vs. dense invocations for the performance statistics
        if (DMLScript.STATISTICS && DISPLAY_STATISTICS) {
            if (input.isInSparseFormat() || filter.isInSparseFormat()) {
                conv2dSparseCount.addAndGet(1);
            } else {
                conv2dDenseCount.addAndGet(1);
            }
        }

        // NOTE(review): TEST_SPARSE_INPUT / TEST_SPARSE_FILTER are not defined in this chunk;
        // presumably debug flags declared elsewhere in this class that force sparse inputs
        // for testing — verify before relying on this behavior.
        if (!input.isInSparseFormat() && TEST_SPARSE_INPUT) {
            input.denseToSparse();
        }
        if (!filter.isInSparseFormat() && TEST_SPARSE_FILTER) {
            filter.denseToSparse();
        }

        runConvTask(TaskType.LoopedIm2ColConv2d, params);
    }

    /**
     * Conv2d for a single image n: computes im2col of image n, multiplies the filter
     * (params.input2, [K x CRS]) with it into a [K x PQ] block, and copies that block
     * into row n of the dense output.
     *
     * @param n index of the image to process
     * @param im2ColOutBlock pre-allocated im2col scratch block, overwritten here
     * @param params convolution parameters (input1 = input image, input2 = filter)
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params)
            throws DMLRuntimeException {
        // t1..t3 bracket the im2col and matmult phases for the statistics timers (nanoseconds)
        long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
        doIm2col(n, im2ColOutBlock, params);
        im2ColOutBlock.recomputeNonZeros();
        long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;

        MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P * params.Q, false);
        LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock, false);
        long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;

        if (DMLScript.STATISTICS && DISPLAY_STATISTICS) {
            loopedConvIm2ColTime.addAndGet(t2 - t1);
            loopedConvMatMultTime.addAndGet(t3 - t2);
        }

        // -----------------------------------------------------------------------------
        // Copying is required as LibMatrixMult.matrixMult (and/or Java) is not pointer aware.
        // This is not required in Native implementation
        int destPos = n * params.K * params.P * params.Q;
        int length = params.K * params.P * params.Q;
        if (!matMultOutBlock.isEmptyBlock()) {
            if (matMultOutBlock.isInSparseFormat()) {
                // NOTE: Potential bottleneck to copy sparse matmult result back to dense output
                Iterator<IJV> iter = matMultOutBlock.sparseBlock.getIterator();
                final int outOffset = n * params.K * params.P * params.Q;
                while (iter.hasNext()) {
                    IJV ijv = iter.next();
                    // Decompose the column index into (p, q) and scatter into row n's dense layout
                    int k = ijv.getI();
                    int p = ijv.getJ() / params.Q;
                    int q = ijv.getJ() % params.Q;
                    params.output.denseBlock[outOffset + k * params.P * params.Q + p * params.Q + q] = ijv.getV();
                }
            } else
                System.arraycopy(matMultOutBlock.denseBlock, 0, params.output.denseBlock, destPos, length);
        }
        // -----------------------------------------------------------------------------
    }

    /**
     * This method computes the backpropagation errors for the previous layer of the maxpooling operation
     * 
     * @param input input matrix [N x CHW]
     * @param dout dout matrix [N x CPQ]
     * @param outputBlock output matrix (must be dense)
     * @param params convolution parameters
     * @throws DMLRuntimeException if the dimensions are inconsistent or the output is sparse
     */
    public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock,
            ConvolutionParameters params) throws DMLRuntimeException {
        params.input1 = input;
        params.input2 = dout;
        params.output = outputBlock;
        // Fixed: both messages now report the dimensions that are actually checked
        // (previously they printed input's dimensions and K*P*Q in both cases).
        if (input.getNumColumns() != params.C * params.H * params.W || input.getNumRows() != params.N) {
            throw new DMLRuntimeException("Incorrect input dimensions in maxpooling_backward:" + input.getNumRows()
                    + " " + input.getNumColumns() + " " + params.N + " " + params.C * params.H * params.W);
        }

        if (dout.getNumColumns() != params.C * params.P * params.Q || dout.getNumRows() != params.N) {
            throw new DMLRuntimeException("Incorrect dout dimensions in maxpooling_backward:" + dout.getNumRows()
                    + " " + dout.getNumColumns() + " " + params.N + " " + params.C * params.P * params.Q);
        }

        // Track sparse vs. dense invocations for the performance statistics
        if (DMLScript.STATISTICS && DISPLAY_STATISTICS) {
            if (input.isInSparseFormat() || dout.isInSparseFormat()) {
                maxPoolBwdSparseCount.addAndGet(1);
            } else {
                maxPoolBwdDenseCount.addAndGet(1);
            }
        }

        if (params.output.isInSparseFormat())
            throw new DMLRuntimeException("Sparse maxpooling_backward is not supported");

        // Precompute per-(p,q) pooling-window bounds used by getMaxIndex/getMaxIndexSparse
        fillIndexesArray(params);
        runConvTask(TaskType.MaxPooling_Backward, params);
    }

    /**
     * Precomputes the clamped pooling-window bounds for every output position:
     * start/end row indexes per p and start/end column indexes per q. End indexes are
     * exclusive and clamped to the image extent; start indexes are clamped to 0.
     *
     * @param params convolution parameters; the four index arrays are (re)allocated here
     */
    private static void fillIndexesArray(ConvolutionParameters params) {
        params.start_indexes_h = new int[params.P];
        params.end_indexes_h = new int[params.P];
        params.start_indexes_w = new int[params.Q];
        params.end_indexes_w = new int[params.Q];
        for (int p = 0; p < params.P; p++) {
            int start_index_h = p * params.stride_h - params.pad_h;
            final int end_index_h = Math.min(start_index_h + params.R, params.H);
            params.start_indexes_h[p] = Math.max(start_index_h, 0);
            params.end_indexes_h[p] = end_index_h;
        }
        for (int q = 0; q < params.Q; q++) {
            int start_index_w = q * params.stride_w - params.pad_w;
            // Fixed: the end index must be derived from the UNclamped start (mirroring the
            // h loop above). Previously it was computed from the already-clamped start,
            // which widened the window by pad_w columns whenever the start was clamped to 0.
            final int end_index_w = Math.min(start_index_w + params.S, params.W);
            params.start_indexes_w[q] = Math.max(start_index_w, 0);
            params.end_indexes_w[q] = end_index_w;
        }
    }

    /**
     * Maxpooling backward for a single image n: dispatches to one of four implementations
     * depending on the sparse/dense formats of the input (input1) and dout (input2).
     * The output must be in dense format.
     *
     * @param n index of the image to process
     * @param params convolution parameters (input1 = input image, input2 = dout)
     * @throws DMLRuntimeException if the output is in sparse format
     */
    private static void doPoolingBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
        // A null array below means the corresponding block is in sparse format
        double[] inputArray = null;
        if (!params.input1.isInSparseFormat())
            inputArray = params.input1.getDenseBlock();
        double[] doutArray = null;
        if (!params.input2.isInSparseFormat())
            doutArray = params.input2.getDenseBlock();
        double[] outputArray = null;
        if (!params.output.isInSparseFormat())
            outputArray = params.output.getDenseBlock();
        else
            throw new DMLRuntimeException("Only dense output supported for pooling_backward");

        // Dispatch on the (input, dout) format combination
        if (inputArray != null) {
            if (doutArray != null)
                doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params);
            else
                doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params);
        } else {
            if (doutArray != null)
                doPoolingBackwardSparseDense(n, doutArray, outputArray, params);
            else
                doPoolingBackwardSparseSparse(n, outputArray, params);
        }
    }

    /**
     * Maxpooling backward for SPARSE input and DENSE dout (image n): for every non-zero
     * dout value, locates the maximum of the corresponding pooling window in the sparse
     * input and accumulates the dout value at that position of the dense output.
     *
     * @param n index of the image to process
     * @param doutArray dense dout values ([N x CPQ] layout)
     * @param outputArray dense output gradient ([N x CHW] layout), accumulated in place
     * @param params convolution parameters (input1 must be sparse)
     * @throws DMLRuntimeException if input1 is not in sparse format
     */
    private static void doPoolingBackwardSparseDense(int n, double[] doutArray, double[] outputArray,
            ConvolutionParameters params) throws DMLRuntimeException {
        if (!params.input1.isInSparseFormat())
            throw new DMLRuntimeException("Incorrect usage: Call optimized versions");

        for (int c = 0; c < params.C; c++) {
            for (int p = 0; p < params.P; p++) {
                for (int q = 0; q < params.Q; q++) {
                    double inVal = doutArray[n * params.C * params.P * params.Q + c * params.P * params.Q
                            + p * params.Q + q];
                    // Zero gradients contribute nothing; skip the (expensive) max search
                    if (inVal != 0) {
                        final int inputOffset = n * params.C * params.H * params.W + c * params.H * params.W;
                        int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
                        outputArray[maxIndex] += inVal;
                    }
                }
            }
        }
    }

    /**
     * Maxpooling backward for SPARSE input and SPARSE dout (image n): iterates the stored
     * dout values of row n, locates the maximum of the corresponding pooling window in
     * the sparse input, and accumulates each dout value at that output position.
     *
     * @param n index of the image to process
     * @param outputArray dense output gradient ([N x CHW] layout), accumulated in place
     * @param params convolution parameters (input1 must be sparse, input2 sparse and non-empty)
     * @throws DMLRuntimeException if input1 is not in sparse format
     */
    private static void doPoolingBackwardSparseSparse(int n, double[] outputArray, ConvolutionParameters params)
            throws DMLRuntimeException {
        if (!params.input1.isInSparseFormat())
            throw new DMLRuntimeException("Incorrect usage: Call optimized versions");

        // params.input2.isEmptyBlock() check is done by the caller
        Iterator<IJV> iter = params.input2.sparseBlock.getIterator(n, n + 1);
        int[] tensorIndexes = new int[3];

        while (iter.hasNext()) {
            IJV ijv = iter.next();
            // Decompose dout's column index into (channel c, output row p, output col q)
            computeTensorIndexes(ijv.getJ(), tensorIndexes, params.P, params.Q);
            int c = tensorIndexes[0];
            int p = tensorIndexes[1];
            int q = tensorIndexes[2];

            final int inputOffset = n * params.C * params.H * params.W + c * params.H * params.W;
            int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
            outputArray[maxIndex] += ijv.getV();
        }

    }

    /**
     * Maxpooling backward for DENSE input and SPARSE dout (image n): iterates the stored
     * dout values of row n, locates the maximum of the corresponding pooling window in
     * the dense input, and accumulates each dout value at that output position.
     *
     * @param n index of the image to process
     * @param inputArray dense input values ([N x CHW] layout)
     * @param dout sparse dout block (non-empty; checked by the caller)
     * @param outputArray dense output gradient ([N x CHW] layout), accumulated in place
     * @param params convolution parameters
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    private static void doPoolingBackwardDenseSparse(int n, double[] inputArray, MatrixBlock dout,
            double[] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
        // dout.isEmptyBlock() check is done by the caller
        Iterator<IJV> iter = dout.sparseBlock.getIterator(n, n + 1);
        int[] tensorIndexes = new int[3];

        while (iter.hasNext()) {
            IJV ijv = iter.next();
            // Decompose dout's column index into (channel c, output row p, output col q)
            computeTensorIndexes(ijv.getJ(), tensorIndexes, params.P, params.Q);
            int c = tensorIndexes[0];
            int p = tensorIndexes[1];
            int q = tensorIndexes[2];

            final int inputOffset = n * params.C * params.H * params.W + c * params.H * params.W;
            int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params);
            outputArray[maxIndex] += ijv.getV();
        }
    }

    /**
     * Maxpooling backward for DENSE input and DENSE dout (image n): for every output
     * cell (c, p, q), routes its dout value to the position of the maximum within the
     * corresponding pooling window of the input.
     *
     * @param n index of the image to process
     * @param inputArray dense input values ([N x CHW] layout)
     * @param doutArray dense dout values ([N x CPQ] layout)
     * @param outputArray dense output gradient ([N x CHW] layout), accumulated in place
     * @param params convolution parameters
     */
    private static void doPoolingBackwardDenseDense(int n, double[] inputArray, double[] doutArray,
            double[] outputArray, ConvolutionParameters params) {
        final int HW = params.H * params.W;
        final int PQ = params.P * params.Q;
        for (int c = 0; c < params.C; c++) {
            final int inOff = (n * params.C + c) * HW;
            final int outOff = (n * params.C + c) * PQ;

            for (int p = 0; p < params.P; p++) {
                for (int q = 0; q < params.Q; q++) {
                    final int idx = getMaxIndex(p, q, inOff, inputArray, params);
                    outputArray[idx] += doutArray[outOff + p * params.Q + q];
                }
            }
        }
    }

    /**
     * Returns the index (into the dense [N x CHW] layout) of the maximum STORED value
     * inside the pooling window (p, q) of channel c of sparse image n. Defaults to the
     * window's top-left corner when no stored value falls inside the window.
     *
     * NOTE(review): implicit zeros of the sparse input are never candidates (the iterator
     * visits stored values only), so if a window's true maximum is an unstored zero this
     * differs from the dense getMaxIndex — verify whether that is intended.
     * NOTE(review): every call scans ALL stored values of row n and filters by channel and
     * window bounds; a potential hotspot when invoked per output cell.
     *
     * @param p output row index (selects the h-window via the precomputed index arrays)
     * @param q output column index (selects the w-window)
     * @param inputOffset dense offset of image n / channel c (n*CHW + c*HW)
     * @param n image index
     * @param c channel index
     * @param input sparse input block
     * @param params convolution parameters (index arrays filled by fillIndexesArray)
     * @return dense index of the maximum stored value in the window
     * @throws DMLRuntimeException if input is not in sparse format
     */
    private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input,
            ConvolutionParameters params) throws DMLRuntimeException {
        if (!input.isInSparseFormat())
            throw new DMLRuntimeException("Incorrect usage: Only sparse format supported");

        // input.isEmptyBlock() check is done by the caller
        Iterator<IJV> iter = input.sparseBlock.getIterator(n, n + 1);
        int[] tensorIndexes = new int[3];

        // Window bounds (end exclusive), precomputed by fillIndexesArray
        int start_index_h = params.start_indexes_h[p];
        int end_index_h = params.end_indexes_h[p];
        int start_index_w = params.start_indexes_w[q];
        int end_index_w = params.end_indexes_w[q];

        int maxIndex = inputOffset + start_index_h * params.W + start_index_w;
        double maxVal = -Double.MAX_VALUE;

        // Find maxIndex; currDoutVal actually holds the INPUT value despite its name
        double currDoutVal = -1;
        while (iter.hasNext()) {
            IJV ijv = iter.next();
            // Decompose the column index into (channel, h, w) and keep only channel c
            computeTensorIndexes(ijv.getJ(), tensorIndexes, params.H, params.W);
            if (c != tensorIndexes[0])
                continue;
            int h = tensorIndexes[1];
            int w = tensorIndexes[2];
            if (h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) {
                currDoutVal = ijv.getV();
                if (maxVal < currDoutVal) {
                    maxIndex = inputOffset + h * params.W + w;
                    maxVal = currDoutVal;
                }
            }
        }
        return maxIndex;
    }

    /**
     * Returns the index (into the dense [N x CHW] layout) of the maximum value inside
     * the pooling window (p, q) of a dense input. Ties keep the first (row-major)
     * occurrence; an empty window yields the window's top-left index.
     *
     * @param p output row index (selects the h-window via the precomputed index arrays)
     * @param q output column index (selects the w-window)
     * @param inputOffset dense offset of the current image/channel plane
     * @param inputArray dense input values
     * @param params convolution parameters (index arrays filled by fillIndexesArray)
     * @return dense index of the window maximum
     */
    private static int getMaxIndex(int p, int q, int inputOffset, double[] inputArray,
            ConvolutionParameters params) {
        final int hBeg = params.start_indexes_h[p];
        final int hEnd = params.end_indexes_h[p];
        final int wBeg = params.start_indexes_w[q];
        final int wEnd = params.end_indexes_w[q];

        int bestIndex = inputOffset + hBeg * params.W + wBeg;
        double bestVal = -Double.MAX_VALUE;

        for (int h = hBeg; h < hEnd; h++) {
            final int rowBase = inputOffset + h * params.W;
            for (int w = wBeg; w < wEnd; w++) {
                final double val = inputArray[rowBase + w];
                if (val > bestVal) {
                    bestIndex = rowBase + w;
                    bestVal = val;
                }
            }
        }
        return bestIndex;
    }

    /**
     * This method computes the backpropagation errors for the previous layer of the relu operation,
     * i.e. output = (input > 0) * dout (see doReluBackward).
     * 
     * @param input input matrix
     * @param dout errors from next layer (must have the same dimensions as input)
     * @param outputBlock output matrix
     * @param numThreads number of threads
     * @throws DMLRuntimeException if the dimensions of input and dout differ
     */
    public static void reluBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, int numThreads)
            throws DMLRuntimeException {
        int N = input.getNumRows();
        // Only N and numThreads are meaningful for relu; the conv-specific dimensions are unused (-1)
        ConvolutionParameters params = new ConvolutionParameters(N, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                numThreads);
        params.input1 = input;
        params.input2 = dout;
        params.output = outputBlock;
        if (input.getNumRows() != dout.getNumRows() || input.getNumColumns() != dout.getNumColumns()) {
            throw new DMLRuntimeException("Incorrect dimensions for relu_backward:" + input.getNumRows() + " != "
                    + dout.getNumRows() + " || " + input.getNumColumns() + " != " + dout.getNumColumns());
        }
        runConvTask(TaskType.ReluBackward, params);
    }

    /**
     * Computes output = (X > 0) * dout for row n of the inputs.
     *
     * Fixed: the previous implementation first wrote the indicator (X > 0) into the
     * output row and then multiplied only at dout's STORED positions when dout was
     * sparse — leaving a stray 1 wherever X > 0 but dout held an (unstored) zero.
     * The rewrite materializes the indicator separately and writes the full product,
     * so sparse-dout zeros correctly produce zeros.
     *
     * @param n index of the row to process
     * @param params convolution parameters (input1 = X, input2 = dout, output dense)
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    private static void doReluBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
        // (X > 0) * dout
        double[] outputArray = params.output.getDenseBlock();
        int numOutCols = params.input1.getNumColumns();
        int rowOffset = n * numOutCols;

        if (!params.input1.isInSparseFormat() && !params.input2.isInSparseFormat()) {
            // Fast path: both dense — fuse indicator and multiply in one pass
            double[] inputArr = params.input1.getDenseBlock();
            double[] doutArr = params.input2.getDenseBlock();
            for (int i = rowOffset; i < rowOffset + numOutCols; i++) {
                outputArray[i] = inputArr[i] > 0 ? doutArr[i] : 0;
            }
        } else {
            // Materialize the indicator (X > 0) for row n into a scratch array
            double[] indicator = new double[numOutCols];
            if (params.input1.isInSparseFormat()) {
                // Unstored (zero) X entries leave indicator at 0, which is correct
                Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n, n + 1);
                while (iter.hasNext()) {
                    IJV ijv = iter.next();
                    indicator[ijv.getJ()] = ijv.getV() > 0 ? 1 : 0;
                }
            } else {
                double[] inputArr = params.input1.getDenseBlock();
                for (int j = 0; j < numOutCols; j++) {
                    indicator[j] = inputArr[rowOffset + j] > 0 ? 1 : 0;
                }
            }
            // Multiply with dout into the output row
            if (params.input2.isInSparseFormat()) {
                // Zero the row first so positions where dout is (implicitly) zero end up 0
                Arrays.fill(outputArray, rowOffset, rowOffset + numOutCols, 0);
                Iterator<IJV> iter = params.input2.sparseBlock.getIterator(n, n + 1);
                while (iter.hasNext()) {
                    IJV ijv = iter.next();
                    int j = ijv.getJ();
                    outputArray[rowOffset + j] = indicator[j] * ijv.getV();
                }
            } else {
                double[] doutArr = params.input2.getDenseBlock();
                for (int j = 0; j < numOutCols; j++) {
                    outputArray[rowOffset + j] = indicator[j] * doutArr[rowOffset + j];
                }
            }
        }
    }

    /**
     * Performs the operation corresponding to the DML script:
     * ones = matrix(1, rows=1, cols=Hout*Wout)      
     * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
     * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
     * 
     * @param input input matrix [N x K*PQ]
     * @param bias bias column vector [K x 1]
     * @param outputBlock output matrix
     * @param numThreads number of threads
     * @throws DMLRuntimeException if bias is not a column vector or K does not divide input's column count
     */
    public static void biasAdd(MatrixBlock input, MatrixBlock bias, MatrixBlock outputBlock, int numThreads)
            throws DMLRuntimeException {
        int N = input.getNumRows();
        int K = bias.getNumRows();

        // Fixed: validate BEFORE dividing by K and before mutating the inputs. Previously
        // PQ = cols/K was computed first (K == 0 raised ArithmeticException instead of the
        // intended DMLRuntimeException) and the denseToSparse conversions ran even when the
        // inputs were about to be rejected.
        if (bias.getNumColumns() != 1 || K <= 0 || input.getNumColumns() % K != 0) {
            throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + N + " X "
                    + input.getNumColumns() + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
        }
        int PQ = input.getNumColumns() / K;

        // PQ travels in the second constructor slot; doBiasAdd reads it back via params.C
        ConvolutionParameters params = new ConvolutionParameters(N, PQ, -1, -1, K, -1, -1, -1, -1, -1, -1,
                numThreads);
        params.input1 = input;
        params.input2 = bias;
        params.output = outputBlock;

        // NOTE(review): TEST_SPARSE_INPUT / TEST_SPARSE_FILTER are not defined in this chunk;
        // presumably debug flags declared elsewhere in this class — verify.
        if (!input.isInSparseFormat() && TEST_SPARSE_INPUT) {
            input.denseToSparse();
        }
        if (!bias.isInSparseFormat() && TEST_SPARSE_FILTER) {
            bias.denseToSparse();
        }

        if (input.isEmptyBlock()) {
            // Empty input: the result is just the broadcast bias; fill it directly row by row
            double[] outputArray = outputBlock.getDenseBlock();
            for (int n = 0; n < N; n++)
                fillBias(bias, outputArray, n, n + 1, N, K, PQ);
        } else {
            runConvTask(TaskType.BiasAdd, params);
        }
    }

    /**
     * Computes output[n, :] = input[n, :] + broadcast(bias) for rows n in [n1, n2).
     * Each input row is laid out as K channels of PQ cells; bias[k] is added to
     * every cell of channel k.
     * 
     * NOTE(review): PQ is read from params.C — biasAdd passes PQ as the second
     * constructor argument of ConvolutionParameters; confirm that argument maps to C.
     * 
     * @param n1 first row index (inclusive)
     * @param n2 last row index (exclusive)
     * @param params input1 = input, input2 = bias, output pre-allocated dense
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    private static void doBiasAdd(int n1, int n2, ConvolutionParameters params) throws DMLRuntimeException {
        double[] outputArray = params.output.getDenseBlock();
        int PQ = params.C;
        int numOutCols = params.input1.getNumColumns();

        if (!params.input1.isInSparseFormat() && !params.input2.isInSparseFormat()) {
            // Fast path: both dense — walk the output linearly with one running index.
            double[] inputArr = params.input1.getDenseBlock();
            double[] biasArr = params.input2.getDenseBlock();
            int K = params.K;
            int index = n1 * K * PQ;
            for (int n = n1; n < n2; n++) {
                for (int k = 0; k < K; k++) {
                    for (int pq = 0; pq < PQ; pq++, index++) {
                        outputArray[index] = inputArr[index] + biasArr[k];
                    }
                }
            }
        } else {
            // Sparse/mixed path: first materialize the broadcast bias into the output,
            // then add the input contribution on top.
            fillBias(params.input2, outputArray, n1, n2, params.N, params.K, PQ);
            if (params.input1.isInSparseFormat()) {
                // Only non-zero input cells contribute to the sum.
                Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n1, n2);
                while (iter.hasNext()) {
                    IJV ijv = iter.next();
                    int i = ijv.getI();
                    int j = ijv.getJ();
                    outputArray[i * numOutCols + j] += ijv.getV();
                }
            } else {
                double[] inputArr = params.input1.getDenseBlock();
                for (int i = n1 * numOutCols; i < n2 * numOutCols; i++) {
                    outputArray[i] += inputArr[i];
                }
            }
        }

    }

    /**
     * Writes the bias vector into output rows [n1, n2): for every image n and
     * channel k, the PQ cells of that channel are set to bias[k].
     * In the sparse case only the non-zero bias entries are written; all other
     * output cells are left untouched.
     */
    private static void fillBias(MatrixBlock bias, double[] outputArray, int n1, int n2, int N, int K, int PQ) {
        if (bias.isInSparseFormat()) {
            // Walk the non-zero bias entries once, broadcasting each across all rows.
            Iterator<IJV> cells = bias.sparseBlock.getIterator();
            while (cells.hasNext()) {
                IJV cell = cells.next();
                final int k = cell.getI();
                final double biasVal = cell.getV();
                for (int n = n1; n < n2; n++) {
                    final int start = (n * K + k) * PQ;
                    Arrays.fill(outputArray, start, start + PQ, biasVal);
                }
            }
        } else {
            double[] biasArr = bias.getDenseBlock();
            for (int n = n1; n < n2; n++) {
                // Channels of one image are contiguous: advance start by PQ per channel.
                int start = n * K * PQ;
                for (int k = 0; k < K; k++, start += PQ) {
                    Arrays.fill(outputArray, start, start + PQ, biasArr[k]);
                }
            }
        }
    }

    /**
     * Max-pooling forward entry point.
     * 
     * @param input input matrix of shape [N X C*H*W]
     * @param outputBlock pre-allocated output matrix
     * @param params pooling parameters (N, C, H, W, P, Q, strides, padding)
     * @throws DMLRuntimeException if the input dimensions do not match params
     */
    public static void maxpooling(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params)
            throws DMLRuntimeException {
        params.input1 = input;
        params.output = outputBlock;

        // BUGFIX: the message previously printed K*P*Q (the output width) although
        // the check compares against the expected input width C*H*W.
        if (input.getNumColumns() != params.C * params.H * params.W || input.getNumRows() != params.N) {
            throw new DMLRuntimeException("Incorrect input dimensions in maxpooling: [" + input.getNumRows()
                    + " X " + input.getNumColumns() + "] vs expected [" + params.N + " X "
                    + params.C * params.H * params.W + "]");
        }

        // Precompute per-output-cell pooling window boundaries (start/end_indexes_h/w).
        fillIndexesArray(params);
        runConvTask(TaskType.MaxPooling_Forward, params);
    }

    /**
     * Max-pooling forward for a single image n: for every channel c and output
     * cell (p, q), takes the max over the precomputed pooling window and combines
     * it (via Math.max) with the existing output value.
     * 
     * NOTE(review): because Math.max is taken against the existing output value,
     * this assumes the output was pre-initialized by the caller; with a 0-initialized
     * output, an all-negative pooling window would yield 0 — confirm the intended
     * initialization contract.
     * 
     * @param n image (row) index
     * @param params pooling parameters; start/end_indexes_h/w must be precomputed
     * @throws DMLRuntimeException if the output is not allocated in dense format
     */
    private static void doPooling(int n, ConvolutionParameters params) throws DMLRuntimeException {
        double[] inputArray = null;
        if (!params.input1.isInSparseFormat())
            inputArray = params.input1.getDenseBlock();
        double[] outputArray = null;
        if (!params.output.isInSparseFormat())
            outputArray = params.output.getDenseBlock();
        else
            throw new DMLRuntimeException("Expected the output to be allocated in dense format");

        final int inOffset = n * params.C * params.H * params.W; // start of image n in the input
        int out_index = n * params.C * params.P * params.Q;      // running output index for image n
        final int HW = params.H * params.W;                      // size of one input channel

        if (inputArray != null) {
            // Dense input: direct array access.
            for (int c = 0; c < params.C; c++) {
                final int inOffset1 = inOffset + c * HW;
                for (int p = 0; p < params.P; p++) {
                    for (int q = 0; q < params.Q; q++, out_index++) {
                        // Scan the [h, w) window precomputed for this output cell.
                        for (int h = params.start_indexes_h[p]; h < params.end_indexes_h[p]; h++) {
                            for (int w = params.start_indexes_w[q]; w < params.end_indexes_w[q]; w++) {
                                outputArray[out_index] = Math.max(outputArray[out_index],
                                        inputArray[inOffset1 + h * params.W + w]);
                            }
                        }
                    }
                }
            }
        } else {
            // TODO: Optimize sparse maxpooling
            // Low priority after adding fused relu_maxpooling operator as output of conv2d expected to be dense
            for (int c = 0; c < params.C; c++) {
                for (int p = 0; p < params.P; p++) {
                    for (int q = 0; q < params.Q; q++, out_index++) {
                        for (int h = params.start_indexes_h[p]; h < params.end_indexes_h[p]; h++) {
                            for (int w = params.start_indexes_w[q]; w < params.end_indexes_w[q]; w++) {
                                // Per-cell lookup into the sparse input.
                                double inVal = params.input1.quickGetValue(n, c * HW + h * params.W + w);
                                outputArray[out_index] = Math.max(outputArray[out_index], inVal);
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * Copies image inputN of the input (row layout [K][P][Q]) into image outputN of
     * outputArray, permuting the cell order to [P][Q][K] (channel becomes the
     * fastest-varying index).
     * 
     * @param inputN image index to read from input
     * @param outputN image index to write in outputArray
     * @param input source matrix with rows of length K*P*Q
     * @param outputArray dense destination buffer (sparse output unsupported)
     * @param params supplies K, P and Q
     * @param zeroOutSparseOutput if true and the input is sparse, the entire output
     *        array is zeroed first (the sparse path writes non-zeros only)
     * @throws DMLRuntimeException if outputArray is null (sparse output requested)
     */
    private static void doRotate180(int inputN, int outputN, MatrixBlock input, double[] outputArray,
            ConvolutionParameters params, boolean zeroOutSparseOutput) throws DMLRuntimeException {
        double[] inputArray = null;
        if (!input.isInSparseFormat())
            inputArray = input.getDenseBlock();
        if (outputArray == null)
            throw new DMLRuntimeException("Sparse output is not supported for rotate180");

        int outputOffset = outputN * params.K * params.P * params.Q;
        if (inputArray != null) {
            // Dense path: out[p][q][k] = in[k][p][q].
            for (int k = 0; k < params.K; k++) {
                for (int p = 0; p < params.P; p++) {
                    for (int q = 0; q < params.Q; q++) {
                        outputArray[outputOffset + p * params.Q * params.K + q * params.K
                                + k] = inputArray[inputN * params.K * params.P * params.Q + k * params.P * params.Q
                                        + p * params.Q + q];
                    }
                }
            }
        } else {
            if (zeroOutSparseOutput)
                Arrays.fill(outputArray, 0);

            if (!input.isEmptyBlock()) {
                // Sparse path: decode each non-zero's column index into (k, p, q)
                // and write it to the permuted position.
                Iterator<IJV> iter = input.sparseBlock.getIterator(inputN, inputN + 1);
                int[] tensorIndexes = new int[3];
                while (iter.hasNext()) {
                    IJV ijv = iter.next();
                    computeTensorIndexes(ijv.getJ(), tensorIndexes, params.P, params.Q);
                    int k = tensorIndexes[0];
                    int p = tensorIndexes[1];
                    int q = tensorIndexes[2];
                    outputArray[outputOffset + p * params.Q * params.K + q * params.K + k] = ijv.getV();
                }
            }
        }
    }

    // ----------------------------------------------------------------------------------------------------------------
    /**
     * Pre-allocates the scratch MatrixBlocks required by the given task type and
     * places poolSize instances of each into the corresponding worker queue.
     */
    private static void addMatrixBlocks(int poolSize, TaskType type, ConvolutionParameters params,
            ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks,
            ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks,
            ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) {
        // Which scratch buffers does this task type consume?
        final boolean needIm2Col = type == TaskType.LoopedIm2ColConv2d
                || type == TaskType.LoopedIm2ColConv2dBwdFilter;
        final boolean needPartialRet = type == TaskType.LoopedIm2ColConv2dBwdFilter;
        final boolean needDoutReshaped = type == TaskType.LoopedIm2ColConv2dBwdData
                || type == TaskType.LoopedIm2ColConv2dBwdFilter;

        for (int i = 0; i < poolSize; i++) {
            if (needIm2Col) {
                // Scratch for the im2col result: [C*R*S X P*Q].
                MatrixBlock blk = new MatrixBlock(params.C * params.R * params.S, params.P * params.Q, false);
                blk.allocateDenseBlock(true);
                im2ColOutBlocks.add(blk);
            }
            if (needPartialRet) {
                // Per-worker partial filter gradient: [K X C*R*S].
                MatrixBlock blk = new MatrixBlock(params.K, params.C * params.R * params.S, false);
                blk.allocateDenseBlock(true);
                partialRetBlocks.add(blk);
            }
            if (needDoutReshaped) {
                // Scratch for the reshaped dout: [P*Q X K].
                MatrixBlock blk = new MatrixBlock(params.P * params.Q, params.K, false);
                blk.allocateDenseBlock(true);
                doutReshapedBlocks.add(blk);
            }
        }
    }

    // Methods to execute convolution-related tasks using multiple threads.
    /**
     * Dispatches the given task type over all N images, either multi-threaded (when
     * allowed and the output is thread-safe) or single-threaded. Scratch MatrixBlocks
     * are pre-allocated per worker and shared through concurrent queues.
     * 
     * @param type the convolution-related operation to execute
     * @param params operation parameters (inputs/outputs already set by the caller)
     * @throws DMLRuntimeException if any task fails or the pool is interrupted
     */
    private static void runConvTask(TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
        int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
        ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks = new ConcurrentLinkedQueue<MatrixBlock>();
        ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks = new ConcurrentLinkedQueue<MatrixBlock>();
        ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks = new ConcurrentLinkedQueue<MatrixBlock>();
        if (ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && constrainedNumThreads > 1) {
            int poolSize = Math.min(constrainedNumThreads, params.N);
            addMatrixBlocks(poolSize, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks);
            ArrayList<ConvTask> tasks = new ArrayList<ConvTask>();
            int NSize = params.N - poolSize;
            if (NSize >= constrainedNumThreads) {
                // Plenty of work: one single-image task per image.
                for (int n = 0; n < params.N; n++)
                    tasks.add(new ConvTask(n, n + 1, type, params, im2ColOutBlocks, doutReshapedBlocks,
                            partialRetBlocks));
            } else {
                // Few images: chunk the first NSize images, then one task per remaining image.
                int numNTasks = (int) Math.ceil(((double) NSize) / constrainedNumThreads);
                for (int n = 0; n < NSize; n += numNTasks) {
                    tasks.add(new ConvTask(n, Math.min(NSize, n + numNTasks), type, params, im2ColOutBlocks,
                            doutReshapedBlocks, partialRetBlocks));
                }
                for (int n = NSize; n < params.N; n++)
                    tasks.add(new ConvTask(n, n + 1, type, params, im2ColOutBlocks, doutReshapedBlocks,
                            partialRetBlocks));
            }

            ExecutorService pool = Executors.newFixedThreadPool(poolSize);
            try {
                List<Future<Object>> taskret = pool.invokeAll(tasks);
                // Propagate any exception thrown inside a task.
                for (Future<Object> task : taskret) {
                    task.get();
                }
                if (type == TaskType.LoopedIm2ColConv2dBwdFilter) {
                    // Aggregate the per-worker partial gradients into the final output.
                    for (MatrixBlock partialRetBlock : partialRetBlocks) {
                        elementWiseInPlaceAddition(params.output, partialRetBlock);
                    }
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // restore the interrupt status
                throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e);
            } catch (ExecutionException e) {
                throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e);
            } finally {
                // BUGFIX: always release the worker threads, even if invokeAll/get threw;
                // previously shutdown() was skipped on failure, leaking the pool.
                pool.shutdown();
            }
        } else {
            // Single-threaded fallback: reuse one ConvTask object for every image.
            addMatrixBlocks(1, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks);
            ConvTask task = new ConvTask(0, 0, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks);
            try {
                for (int n = 0; n < params.N; n++) {
                    task.n1 = n;
                    task.n2 = n + 1;
                    task.call();
                }
                if (type == TaskType.LoopedIm2ColConv2dBwdFilter) {
                    for (MatrixBlock partialRetBlock : partialRetBlocks) {
                        elementWiseInPlaceAddition(params.output, partialRetBlock);
                    }
                }
            } catch (Exception e) {
                throw new DMLRuntimeException("Error while executing single-threaded " + type.name(), e);
            }
        }
    }
    // ----------------------------------------------------------------------------------------------------------------

    /**
     * The ConvTask allows the convolution operations (such as conv2d, conv2d_backward,
     * maxpooling, etc.) to be executed in a multi-threaded manner. Each task processes
     * the image (row) indexes in the half-open range [n1, n2).
     * 
     * The ConcurrentLinkedQueue fields serve as pools of pre-allocated scratch
     * MatrixBlocks (filled by addMatrixBlocks): a task removes a block, uses it,
     * and returns it to the queue when done.
     */
    private static class ConvTask implements Callable<Object> {
        // n1/n2 are public and mutable so the single-threaded driver in runConvTask
        // can reuse one task object for every image.
        public int n1;
        public int n2;
        ConvolutionParameters params;
        TaskType type;
        ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks;
        ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks;
        ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks;

        public ConvTask(int n1, int n2, TaskType type, ConvolutionParameters params,
                ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks,
                ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks,
                ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) {
            this.n1 = n1;
            this.n2 = n2;
            this.type = type;
            this.params = params;
            this.im2ColOutBlocks = im2ColOutBlocks;
            this.partialRetBlocks = partialRetBlocks;
            this.doutReshapedBlocks = doutReshapedBlocks;
        }

        /**
         * Dispatches on the task type and processes images n1 (inclusive) to n2 (exclusive).
         * 
         * @return always null; failures are signalled via exceptions
         * @throws DMLRuntimeException if the task type is unsupported or a worker method fails
         */
        @Override
        public Object call() throws DMLRuntimeException {
            switch (type) {
            case MaxPooling_Forward: {
                for (int n = n1; n < n2; n++) {
                    doPooling(n, params);
                }
                break;
            }
            case MaxPooling_Backward:
                for (int n = n1; n < n2; n++)
                    doPoolingBackward(n, params);
                break;
            case BiasAdd:
                // doBiasAdd handles the whole [n1, n2) range internally.
                doBiasAdd(n1, n2, params);
                break;
            case ReluBackward:
                for (int n = n1; n < n2; n++)
                    doReluBackward(n, params);
                break;
            case LoopedIm2ColConv2d: {
                // Borrow a scratch im2col block from the pool and return it afterwards.
                MatrixBlock im2ColOutBlock = im2ColOutBlocks.remove();
                for (int n = n1; n < n2; n++)
                    doLoopedIm2ColConv2d(n, im2ColOutBlock, params);
                im2ColOutBlocks.add(im2ColOutBlock);
                if (params.bias != null)
                    addBias(n1, n2, params);
                break;
            }
            case LoopedIm2ColConv2dBwdFilter: {
                MatrixBlock im2ColOutBlock = im2ColOutBlocks.remove();
                MatrixBlock partialRetBlock = partialRetBlocks.remove();
                MatrixBlock doutReshapedBlock = doutReshapedBlocks.remove();
                // The partial filter-gradient block accumulates over this task's images
                // and is aggregated into the final output by runConvTask.
                for (int n = n1; n < n2; n++)
                    partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock,
                            partialRetBlock, params);
                im2ColOutBlocks.add(im2ColOutBlock);
                partialRetBlocks.add(partialRetBlock);
                doutReshapedBlocks.add(doutReshapedBlock);
                break;
            }
            case LoopedIm2ColConv2dBwdData: {
                MatrixBlock doutReshapedBlock = doutReshapedBlocks.remove();
                for (int n = n1; n < n2; n++)
                    doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, params);
                doutReshapedBlocks.add(doutReshapedBlock);
                break;
            }
            default:
                throw new DMLRuntimeException("Unsupported ConvTask:" + type.name());
            }
            return null;
        }
    }

    /**
     * Adds the bias vector to the conv2d output for rows [n1, n2):
     * output[n, k*PQ + pq] += bias[k] for every pq in [0, PQ).
     */
    private static void addBias(int n1, int n2, ConvolutionParameters params) {
        final int PQ = params.P * params.Q;
        final int K = params.K;
        double[] outputArr = params.output.getDenseBlock();
        if (params.bias.isInSparseFormat()) {
            // Only non-zero bias entries contribute.
            Iterator<IJV> cells = params.bias.getSparseBlockIterator();
            while (cells.hasNext()) {
                IJV cell = cells.next();
                final int k = cell.getI();
                final double biasVal = cell.getV();
                for (int n = n1; n < n2; n++) {
                    final int start = (n * K + k) * PQ;
                    for (int pq = 0; pq < PQ; pq++)
                        outputArr[start + pq] += biasVal;
                }
            }
        } else {
            double[] biasArr = params.bias.getDenseBlock();
            for (int n = n1; n < n2; n++) {
                for (int k = 0; k < K; k++) {
                    final int start = (n * K + k) * PQ;
                    final double biasVal = biasArr[k];
                    for (int pq = 0; pq < PQ; pq++)
                        outputArr[start + pq] += biasVal;
                }
            }
        }
    }

    // Converts input: PQ X CRS matrix and writes to 1 X CHW
    /**
     * col2im for a single image: dispatches to the dense or sparse implementation,
     * accumulating the [P*Q X C*R*S] column matrix into output image outputN.
     */
    private static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params)
            throws DMLRuntimeException {
        // The column matrix must be exactly [P*Q X C*R*S].
        if (input.rlen != params.P * params.Q || input.clen != params.C * params.R * params.S)
            throw new DMLRuntimeException("Incorrect input dimensions");

        if (params.output.isInSparseFormat())
            throw new DMLRuntimeException("Only dense output is implemented");
        double[] outputArray = params.output.getDenseBlock();

        if (input.isInSparseFormat()) {
            // An empty sparse block contributes nothing.
            if (!input.isEmptyBlock())
                doCol2IMSparseInput(0, outputN, input.getSparseBlockIterator(), outputArray, params);
        } else {
            doCol2IMDenseInput(0, outputN, input.getDenseBlock(), outputArray, params);
        }
    }

    /**
     * col2im for a sparse column matrix: accumulates (+=) each non-zero cell into
     * its (c, h, w) position of output image outputN. Cells whose (h, w) falls into
     * the padding region are dropped.
     * 
     * @param inputN expected image index encoded in the iterator's row indexes (sanity-checked)
     * @param outputN image index in the output to accumulate into
     * @param inputIter iterator over the non-zeros of the [P*Q X C*R*S] column matrix
     * @param outputArray dense output buffer laid out as [N X C*H*W]
     * @param params convolution parameters (R, S, P, Q, C, H, W, strides, padding)
     * @throws DMLRuntimeException if a cell's row index does not belong to image inputN
     */
    private static void doCol2IMSparseInput(int inputN, int outputN, Iterator<IJV> inputIter, double[] outputArray,
            ConvolutionParameters params) throws DMLRuntimeException {
        int[] tensorIndexes = new int[3];

        while (inputIter.hasNext()) {
            IJV ijv = inputIter.next();
            // Column index j encodes (c, r, s).
            computeTensorIndexes(ijv.getJ(), tensorIndexes, params.R, params.S);
            int c = tensorIndexes[0];
            int r = tensorIndexes[1];
            int s = tensorIndexes[2];
            // Row index i encodes (n, p, q).
            computeTensorIndexes(ijv.getI(), tensorIndexes, params.P, params.Q);
            int p = tensorIndexes[1];
            int q = tensorIndexes[2];
            if (inputN != tensorIndexes[0]) {
                // BUGFIX: the original message concatenated ijv.getI() and params.P with
                // no separator, gluing the two numbers together in the diagnostic.
                throw new DMLRuntimeException("Incorrect tensor indexes: " + inputN + " != " + tensorIndexes[0]
                        + " <" + p + " " + q + " " + ijv.getI() + " " + params.P + " " + params.Q + ">");
            }
            int h = p * params.stride_h + r - params.pad_h;
            int w = q * params.stride_w + s - params.pad_w;
            // Accumulate only positions inside the image; padding positions are dropped.
            if (h >= 0 && h < params.H && w >= 0 && w < params.W) {
                int outIndex = outputN * params.C * params.H * params.W + c * params.H * params.W + h * params.W
                        + w;
                outputArray[outIndex] += ijv.getV();
            }
        }
    }

    // Converts input: PQ X CRS matrix and writes to 1 X CHW if inputN == 0
    // Or converts input: NPQ X CRS matrix and writes to N X CHW 
    /**
     * col2im for a dense column matrix: accumulates (+=) every cell into its
     * (c, h, w) position of output image outputN, so overlapping patches sum up.
     * The r/s loop bounds are pre-clamped so only in-image (non-padding)
     * positions are ever touched.
     */
    private static void doCol2IMDenseInput(int inputN, int outputN, double[] inputArray, double[] outputArray,
            ConvolutionParameters params) throws DMLRuntimeException {
        final int outputNOffset = outputN * params.C * params.H * params.W;
        for (int p = 0; p < params.P; p++) {
            // h = p*params.stride_h + r - params.pad_h
            //   = r + hOffset
            // Based on restrictions: h >= 0 and r >= 0 and h < params.H and r < params.R, we get
            // max(0, - hOffset) <= r < min(params.R, params.H - hOffset)
            final int hOffset = p * params.stride_h - params.pad_h;
            final int rStart = Math.max(0, -hOffset);
            final int rEnd = Math.min(params.R, params.H - hOffset);
            for (int q = 0; q < params.Q; q++) {
                // Using the same logic as above on following:
                // w = q*params.stride_w + s - params.pad_w
                final int wOffset = q * params.stride_w - params.pad_w;
                final int sStart = Math.max(0, -wOffset);
                final int sEnd = Math.min(params.S, params.W - wOffset);
                // Start of row (inputN, p, q) in the [.. X C*R*S] column matrix.
                final int tempOffset = (inputN * params.P * params.Q + p * params.Q + q) * params.C * params.R
                        * params.S;
                for (int c = 0; c < params.C; c++) {
                    final int outOffset = outputNOffset + c * params.H * params.W;
                    final int inputOffset = tempOffset + c * params.R * params.S;
                    for (int r = rStart; r < rEnd; r++) {
                        for (int s = sStart; s < sEnd; s++) {
                            int inputIndex = inputOffset + r * params.S + s;
                            int outIndex = outOffset + (hOffset + r) * params.W + wOffset + s;
                            outputArray[outIndex] += inputArray[inputIndex];
                        }
                    }
                }
            }
        }
    }

    /**
     * im2col for a dense input image n: writes the [C*R*S X P*Q] column matrix
     * into outputArray. Positions falling into the padding region are written as 0.
     * 
     * @param n image (row) index within the input
     * @param inputArray dense input laid out as [N X C*H*W]
     * @param outputArray dense output buffer laid out as [C*R*S X P*Q]
     * @param params convolution parameters (C, R, S, H, W, P, Q, strides, padding)
     */
    private static void doIm2colDense(int n, double[] inputArray, double[] outputArray,
            ConvolutionParameters params) {
        int CRS = params.C * params.R * params.S;
        final int nOffset = n * params.C * params.H * params.W; // start of image n
        if (params.stride_h == 1 && params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) {
            // Fast path: unit stride, no padding — each output row is a contiguous input run.
            for (int c = 0; c < CRS; ++c) {
                // Decompose flat row index c into (channel, kernel-row, kernel-col).
                int wOffset = c % params.S;
                int hOffset = (c / params.S) % params.R;
                int cInput = c / params.R / params.S;
                for (int h = 0; h < params.P; ++h) {
                    int hPadded = h + hOffset;
                    int outOffset = (c * params.P + h) * params.Q;
                    int inputOffset = nOffset + (cInput * params.H + hPadded) * params.W;
                    System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, params.Q);
                    // Re-check the last copied element against the image bounds.
                    // NOTE(review): with stride 1 and pad 0 these bounds appear to always
                    // hold (presumably defensive) — confirm against how P and Q are derived.
                    int w = params.Q - 1;
                    int wPadded = w + wOffset;
                    if (hPadded < params.H && wPadded < params.W)
                        outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
                    else
                        outputArray[outOffset + w] = 0;
                }
            }
        } else {
            // General path: compute each padded coordinate; zero out-of-image cells.
            for (int c = 0; c < CRS; ++c) {
                int wOffset = c % params.S;
                int hOffset = (c / params.S) % params.R;
                int cInput = c / params.R / params.S;
                for (int h = 0; h < params.P; ++h) {
                    int outOffset = (c * params.P + h) * params.Q;
                    int hPadded = h * params.stride_h - params.pad_h + hOffset;
                    int inputOffset = nOffset + (cInput * params.H + hPadded) * params.W;
                    if (hPadded < 0 || hPadded >= params.H) {
                        // The entire output row lies in vertical padding.
                        Arrays.fill(outputArray, outOffset, outOffset + params.Q, 0);
                    } else {
                        for (int w = 0; w < params.Q; ++w) {
                            int wPadded = w * params.stride_w - params.pad_w + wOffset;
                            if (wPadded >= 0 && wPadded < params.W)
                                outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
                            else
                                outputArray[outOffset + w] = 0;
                        }
                    }
                }
            }
        }
    }

    // Keeping this as a separate sparse method to allow for further dense optimizations
    /**
     * im2col for a sparse input image n: materializes the [C*R*S X P*Q] column
     * matrix into outputArray. Out-of-image (padding) positions are written as 0.
     */
    private static void doIm2colSparse(int n, MatrixBlock input, double[] outputArray,
            ConvolutionParameters params) {
        final int CRS = params.C * params.R * params.S;
        for (int crs = 0; crs < CRS; ++crs) {
            // Decompose the flat row index into (channel, kernel-row, kernel-col).
            final int s = crs % params.S;
            final int r = (crs / params.S) % params.R;
            final int c = crs / params.R / params.S;
            for (int p = 0; p < params.P; ++p) {
                final int outRowStart = (crs * params.P + p) * params.Q;
                final int h = p * params.stride_h - params.pad_h + r;
                if (h < 0 || h >= params.H) {
                    // The whole output row falls into vertical padding.
                    Arrays.fill(outputArray, outRowStart, outRowStart + params.Q, 0);
                    continue;
                }
                final int inRowStart = (c * params.H + h) * params.W;
                for (int q = 0; q < params.Q; ++q) {
                    final int w = q * params.stride_w - params.pad_w + s;
                    // NOTE: Potential performance bottleneck as we have to do binary search to getValue
                    outputArray[outRowStart + q] = (w >= 0 && w < params.W)
                            ? input.getValue(n, inRowStart + w) : 0;
                }
            }
        }
    }

    /**
     * im2col dispatcher for image n of params.input1: picks the dense or sparse
     * implementation and writes the column matrix into the (dense) output block.
     */
    private static void doIm2col(int n, MatrixBlock output, ConvolutionParameters params)
            throws DMLRuntimeException {
        if (output.isInSparseFormat())
            throw new DMLRuntimeException("Sparse output is not supported for im2col");
        double[] outputArray = output.getDenseBlock();

        if (params.input1.isInSparseFormat())
            doIm2colSparse(n, params.input1, outputArray, params);
        else
            doIm2colDense(n, params.input1.getDenseBlock(), outputArray, params);
    }

    // ------------------------------------------------------------------------------------------------
    // Flags used in integration tests to force the sparse code paths. Please do not edit them.
    // When true, biasAdd converts a dense input to sparse before processing
    // (possibly used by other ops outside this view as well).
    public static boolean TEST_SPARSE_INPUT = false;
    // When true, biasAdd converts a dense bias/filter to sparse before processing.
    public static boolean TEST_SPARSE_FILTER = false;
    // ------------------------------------------------------------------------------------------------
}