org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.java Source code


Introduction

Here is the source code for org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.sysml.runtime.instructions.gpu.context;

import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr;
import static jcuda.jcusparse.JCusparse.cusparseDcsr2dense;
import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
import static jcuda.jcusparse.JCusparse.cusparseDnnz;
import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase;
import static jcuda.jcusparse.JCusparse.cusparseSetMatType;
import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode;
import static jcuda.jcusparse.JCusparse.cusparseXcsrgemmNnz;
import static jcuda.jcusparse.JCusparse.cusparseXcsrgeamNnz;
import static jcuda.jcusparse.cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO;
import static jcuda.jcusparse.cusparseMatrixType.CUSPARSE_MATRIX_TYPE_GENERAL;
import static jcuda.runtime.JCuda.*;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.CacheException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.SparseBlock;
import org.apache.sysml.runtime.matrix.data.SparseBlockCOO;
import org.apache.sysml.runtime.matrix.data.SparseBlockCSR;
import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
import org.apache.sysml.utils.Statistics;

import jcuda.Pointer;
// import jcuda.Sizeof;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import jcuda.jcusparse.JCusparse;
import jcuda.jcusparse.cusparseDirection;
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcusparse.cusparseMatDescr;
import jcuda.jcusparse.cusparsePointerMode;

/**
 * Handle to a matrix block on the GPU
 */
public class JCudaObject extends GPUObject {

    private static final Log LOG = LogFactory.getLog(JCudaObject.class.getName());

    /**
 * Compressed Sparse Row (CSR) format for CUDA.
 * The cuSparse library implements generalized matrix multiply, among other operations, for the CSR format.
     */
    public static class CSRPointer {

        public static cusparseMatDescr matrixDescriptor;

        /**
         * @return Singleton default matrix descriptor object 
         *          (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
         */
        public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
            if (matrixDescriptor == null) {
                // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java
                matrixDescriptor = new cusparseMatDescr();
                cusparseCreateMatDescr(matrixDescriptor);
                cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
                cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO);
            }
            return matrixDescriptor;
        }

        private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;

        /**
         * Default constructor to help with Factory method {@link #allocateEmpty(long, long)}
         */
        private CSRPointer() {
            val = new Pointer();
            rowPtr = new Pointer();
            colInd = new Pointer();
            allocateMatDescrPointer();
        }

        /** Number of non zeroes                               */
        public long nnz;
        /** double array of non zero values                      */
        public Pointer val;
        /** integer array of start of all rows and end of last row + 1    */
        public Pointer rowPtr;
        /** integer array of nnz values' column indices               */
        public Pointer colInd;
        /** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported   */
        public cusparseMatDescr descr;
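
        // Illustrative example of the CSR layout (not from the original source): the 3 x 4 matrix
        //     [ 10  0  0 20 ]
        //     [  0 30  0  0 ]
        //     [  0  0 50 60 ]
        // is stored as
        //     val    = { 10, 20, 30, 50, 60 }   // nnz doubles
        //     rowPtr = { 0, 2, 3, 5 }           // rows + 1 ints; row i spans [rowPtr[i], rowPtr[i+1])
        //     colInd = { 0, 3, 1, 2, 3 }        // nnz ints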

        /** 
         * Check for ultra sparsity
         * 
         * @param rows number of rows
         * @param cols number of columns
         * @return true if ultra sparse
         */
        public boolean isUltraSparse(int rows, int cols) {
            double sp = ((double) nnz / rows / cols);
            return sp < ULTRA_SPARSITY_TURN_POINT;
        }
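
        // For example, nnz = 100 in a 1000 x 1000 matrix gives sp = 1e-4, which is below
        // ULTRA_SPARSITY_TURN_POINT (4e-4), so isUltraSparse(1000, 1000) returns true.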

        /**
         * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
         * the default that works for DGEMM.
         */
        private void allocateMatDescrPointer() {
            this.descr = getDefaultCuSparseMatrixDescriptor();
        }

        /**
         * Estimates the size of a CSR matrix in GPU memory.
         * The size of the device pointers themselves is not included.
         * @param nnz2   number of non zeroes
         * @param rows   number of rows 
         * @return size estimate
         */
        public static long estimateSize(long nnz2, long rows) {
            long sizeofValArray = getDoubleSizeOf(nnz2);
            long sizeofRowPtrArray = getIntSizeOf(rows + 1);
            long sizeofColIndArray = getIntSizeOf(nnz2);
            long sizeofDescr = getIntSizeOf(4);
            // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as: 
            // typedef struct {
            //    cusparseMatrixType_t MatrixType;
            //   cusparseFillMode_t FillMode;
            //   cusparseDiagType_t DiagType;
            //    cusparseIndexBase_t IndexBase;
            // } cusparseMatDescr_t;
            long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
            return tot;
        }
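
        // Worked example (illustrative): for nnz2 = 1000 and rows = 100, the estimate is
        //   8*1000 (val) + 4*101 (rowPtr) + 4*1000 (colInd) + 4*4 (descr)
        //   = 8000 + 404 + 4000 + 16 = 12420 bytes.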

        /** 
         * Factory method to allocate an empty CSR Sparse matrix on the GPU
         * @param nnz2   number of non-zeroes
         * @param rows    number of rows
         * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public static CSRPointer allocateEmpty(long nnz2, long rows) throws DMLRuntimeException {
            assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
            CSRPointer r = new CSRPointer();
            r.nnz = nnz2;
            if (nnz2 == 0) {
                // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object
                // with no memory allocated on the GPU.
                return r;
            }
            ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
            // increment the cudaCount by 1 for the allocation of all 3 arrays
            r.val = allocate(getDoubleSizeOf(nnz2), 0);
            r.rowPtr = allocate(getIntSizeOf(rows + 1), 0);
            r.colInd = allocate(getIntSizeOf(nnz2), 1);
            return r;
        }

        /**
         * Static method to copy a CSR sparse matrix from Host to Device
         * @param dest   [output] destination location (on GPU)
         * @param rows   number of rows
         * @param nnz    number of non-zeroes
         * @param rowPtr   integer array of row pointers
         * @param colInd   integer array of column indices
         * @param values   double array of non zero values
         */
        public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd,
                double[] values) {
            CSRPointer r = dest;
            long t0 = System.nanoTime();
            r.nnz = nnz;
            cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
            cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
            cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
            Statistics.cudaToDevTime.addAndGet(System.nanoTime() - t0);
            Statistics.cudaToDevCount.addAndGet(3);
        }

        /**
         * Static method to copy a CSR sparse matrix from Device to host
         * @param src   [input] source location (on GPU)
         * @param rows   [input] number of rows
         * @param nnz   [input] number of non-zeroes
         * @param rowPtr   [output] pre-allocated integer array of row pointers of size (rows+1)
         * @param colInd   [output] pre-allocated integer array of column indices of size nnz
         * @param values   [output] pre-allocated double array of values of size nnz
         */
        public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd,
                double[] values) {
            CSRPointer r = src;
            long t0 = System.nanoTime();
            cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
            cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
            cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
            Statistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
            Statistics.cudaFromDevCount.addAndGet(3);
        }
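
        // Sketch of a round trip with the two copy methods above (hypothetical caller, not part
        // of the original class), using the 3 x 4 example matrix from the CSR layout comment:
        //
        //   int[]    rowPtr = { 0, 2, 3, 5 };
        //   int[]    colInd = { 0, 3, 1, 2, 3 };
        //   double[] values = { 10, 20, 30, 50, 60 };
        //   CSRPointer dev = CSRPointer.allocateEmpty(values.length, 3);
        //   CSRPointer.copyToDevice(dev, 3, values.length, rowPtr, colInd, values);
        //   ... launch cuSparse operations against dev.val / dev.rowPtr / dev.colInd ...
        //   CSRPointer.copyToHost(dev, 3, values.length, rowPtr, colInd, values);
        //   dev.deallocate();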

        // ==============================================================================================

        // The following methods estimate the memory needed for sparse matrices that are
        // results of operations on other sparse matrices using the cuSparse Library.
        // The operation is C = op(A) binaryOperation op(B), where C is the output and A & B are the inputs,
        // op = whether to transpose or not, and
        // binaryOperation = one of +, - or * (matmul), the binary operations cuSparse supports

        // From CuSparse Manual,
        // Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
        // to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
        // elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
        // and the total number of nonzero elements. In the second step, the user gathers nnzC
        //(number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
        // or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of
        // nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
        // to complete matrix C.

        /**
         * Allocate row pointers of m+1 elements
         * 
         * @param handle   a valid {@link cusparseHandle}
         * @param C         Output matrix
         * @param rowsC         number of rows in C
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        private static void step1AllocateRowPointers(cusparseHandle handle, CSRPointer C, int rowsC)
                throws DMLRuntimeException {
            cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
            cudaDeviceSynchronize();
            // Do not increment the cudaCount of allocations on GPU
            C.rowPtr = allocate(getIntSizeOf((long) rowsC + 1), 0);
        }

        /**
         * Determines the total number of nonzero elements for the cusparseDgeam operation.
         * This is read from either (nnzC = *nnzTotalDevHostPtr) or (nnzC = csrRowPtrC(m) - csrRowPtrC(0)).
         * 
         * @param handle   a valid {@link cusparseHandle}
         * @param A         Sparse Matrix A on GPU
         * @param B         Sparse Matrix B on GPU
         * @param C         Output Sparse Matrix C on GPU
         * @param m         Rows in C
         * @param n         Columns in C
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        private static void step2GatherNNZGeam(cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C,
                int m, int n) throws DMLRuntimeException {
            int[] CnnzArray = { -1 };
            cusparseXcsrgeamNnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr,
                    toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
            cudaDeviceSynchronize();
            if (CnnzArray[0] != -1) {
                C.nnz = CnnzArray[0];
            } else {
                int baseArray[] = { 0 };
                cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
                        cudaMemcpyDeviceToHost);
                cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
                C.nnz = CnnzArray[0] - baseArray[0];
            }
        }

        /**
         * Determines the total number of nonzero elements for the cusparseDgemm operation.
         *
         * @param handle   a valid {@link cusparseHandle}
         * @param A         Sparse Matrix A on GPU
         * @param transA   op - whether A is transposed
         * @param B         Sparse Matrix B on GPU
         * @param transB   op - whether B is transposed
         * @param C         Output Sparse Matrix C on GPU
         * @param m         Number of rows of sparse matrix op ( A ) and C
         * @param n         Number of columns of sparse matrix op ( B ) and C
         * @param k         Number of columns/rows of sparse matrix op ( A ) / op ( B )
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        private static void step2GatherNNZGemm(cusparseHandle handle, CSRPointer A, int transA, CSRPointer B,
                int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException {
            int[] CnnzArray = { -1 };
            if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
                throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
            }
            cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
                    B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
            cudaDeviceSynchronize();
            if (CnnzArray[0] != -1) {
                C.nnz = CnnzArray[0];
            } else {
                int baseArray[] = { 0 };
                cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
                        cudaMemcpyDeviceToHost);
                cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
                C.nnz = CnnzArray[0] - baseArray[0];
            }
        }

        /**
         * Allocate val and index pointers.
         * 
         * @param handle   a valid {@link cusparseHandle}
         * @param C         Output sparse matrix on GPU
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        private static void step3AllocateValNInd(cusparseHandle handle, CSRPointer C) throws DMLRuntimeException {
            // Increment cudaCount by one when all three arrays of CSR sparse array are allocated
            C.val = allocate(getDoubleSizeOf(C.nnz), 0);
            C.colInd = allocate(getIntSizeOf(C.nnz), 1);
        }

        // ==============================================================================================

        /**
         * Estimates the number of non-zero elements in the result of the sparse cusparseDgeam operation
         * C = a op(A) + b op(B) and allocates C on the GPU with the appropriate memory.
         * @param handle    a valid {@link cusparseHandle}
         * @param A         Sparse Matrix A on GPU
         * @param B         Sparse Matrix B on GPU
         * @param m         Rows in C
         * @param n         Columns in C
         * @return CSR (compressed sparse row) pointer
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public static CSRPointer allocateForDgeam(cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n)
                throws DMLRuntimeException {
            if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
                throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
            }
            CSRPointer C = new CSRPointer();
            step1AllocateRowPointers(handle, C, m);
            step2GatherNNZGeam(handle, A, B, C, m, n);
            step3AllocateValNInd(handle, C);
            return C;
        }
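
        // Sketch of the follow-up csrgeam call that this allocation prepares for (illustrative,
        // not part of the original class; assumes a valid handle and host-side Pointers alpha/beta
        // to the scalars a and b, since step 1 sets CUSPARSE_POINTER_MODE_HOST):
        //
        //   CSRPointer C = CSRPointer.allocateForDgeam(handle, A, B, m, n);
        //   JCusparse.cusparseDcsrgeam(handle, m, n,
        //       alpha, A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd,
        //       beta,  B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd,
        //       C.descr, C.val, C.rowPtr, C.colInd);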

        /**
         * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
         * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
         * @param handle   a valid {@link cusparseHandle}
         * @param A         Sparse Matrix A on GPU
         * @param transA   whether A is to be transposed (a cusparseOperation constant)
         * @param B         Sparse Matrix B on GPU
         * @param transB   whether B is to be transposed (a cusparseOperation constant)
         * @param m         Rows in A
         * @param n         Columns in B
         * @param k         Columns in A / Rows in B
         * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public static CSRPointer allocateForMatrixMultiply(cusparseHandle handle, CSRPointer A, int transA,
                CSRPointer B, int transB, int m, int n, int k) throws DMLRuntimeException {
            // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
            // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
            CSRPointer C = new CSRPointer();
            step1AllocateRowPointers(handle, C, m);
            step2GatherNNZGemm(handle, A, transA, B, transB, C, m, n, k);
            step3AllocateValNInd(handle, C);
            return C;
        }
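
        // Sketch of the follow-up csrgemm call (illustrative; the signature matches the
        // JCusparse#cusparseDcsrgemm reference in setSparseMatrixCudaPointer's javadoc below):
        //
        //   CSRPointer C = CSRPointer.allocateForMatrixMultiply(handle, A, transA, B, transB, m, n, k);
        //   JCusparse.cusparseDcsrgemm(handle, transA, transB, m, n, k,
        //       A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd,
        //       B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd,
        //       C.descr, C.val, C.rowPtr, C.colInd);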

        /**
         * Copies this CSR matrix on the GPU to a dense column-major matrix
         * on the GPU. This is a temporary matrix for operations such as 
         * cusparseDcsrmv.
         * Since the allocated matrix is temporary, bookkeeping is not updated.
         * The caller is responsible for calling {@code cudaFree} on the returned {@link Pointer} object.
         * @param cusparseHandle   a valid {@link cusparseHandle}
         * @param cublasHandle       a valid {@link cublasHandle}
         * @param rows      number of rows in this CSR matrix
         * @param cols      number of columns in this CSR matrix
         * @return         A {@link Pointer} to the allocated dense matrix (in column-major format)
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows,
                int cols) throws DMLRuntimeException {
            long size = ((long) rows) * getDoubleSizeOf((long) cols);
            Pointer A = JCudaObject.allocate(size);
            // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
            if (val != null && rowPtr != null && colInd != null && nnz > 0) {
                // Note: cusparseDcsr2dense method cannot handle empty blocks
                cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
                cudaDeviceSynchronize();
            } else {
                LOG.warn("in CSRPointer, the values array, row pointers array or column indices array was null");
            }
            return A;
        }
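
        // Note: the returned buffer is column-major, so element (i, j) sits at byte offset
        // getDoubleSizeOf((long) j * rows + i) in A; per the javadoc above, the caller must
        // cudaFree(A) once done with it.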

        /**
         * Calls cudaFree on the allocated {@link Pointer} instances
         */
        public void deallocate() {
            cudaFree(val);
            cudaFree(rowPtr);
            cudaFree(colInd);
        }
    }

    private static long getDoubleSizeOf(long numElems) {
        return numElems * ((long) jcuda.Sizeof.DOUBLE);
    }

    private static long getIntSizeOf(long numElems) {
        return numElems * ((long) jcuda.Sizeof.INT);
    }
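
    // For example, getDoubleSizeOf(3) == 24 and getIntSizeOf(3) == 12,
    // since jcuda.Sizeof.DOUBLE == 8 and jcuda.Sizeof.INT == 4.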

    public synchronized boolean isAllocated() {
        return (jcudaDenseMatrixPtr != null || jcudaSparseMatrixPtr != null);
    }

    /** Pointer to dense matrix */
    public Pointer jcudaDenseMatrixPtr = null;
    /** Pointer to sparse matrix */
    public CSRPointer jcudaSparseMatrixPtr = null;

    /** Number of bytes occupied by this block on GPU */
    public long numBytes;

    /**
     * Initializes this JCudaObject with a {@link MatrixObject} instance which will contain metadata about the enclosing matrix block
     * @param m the {@link MatrixObject} that holds metadata about this matrix block
     */
    JCudaObject(MatrixObject m) {
        super(m);
    }

    /**
     * Allocates temporary space on the device.
     * Does not update bookkeeping.
     * The caller is responsible for freeing up after usage.
     * @param size            Size of data (in bytes) to allocate
     * @param statsCount   amount to increment the cudaAllocCount by
     * @return jcuda Pointer
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public static Pointer allocate(long size, int statsCount) throws DMLRuntimeException {
        Pointer A = new Pointer();
        ensureFreeSpace(size);
        long t0 = System.nanoTime();
        cudaMalloc(A, size);
        // Set all elements to 0 since newly allocated space will contain garbage
        cudaMemset(A, 0, size);
        Statistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
        Statistics.cudaAllocCount.getAndAdd(statsCount);
        return A;
    }

    /**
     * Convenience method for {@link #allocate(long, int)}, defaults statsCount to 1.
     * @param size size of data (in bytes) to allocate
     * @return jcuda pointer
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public static Pointer allocate(long size) throws DMLRuntimeException {
        return allocate(size, 1);
    }

    /**
     * Allocates a sparse and empty {@link JCudaObject}.
     * This represents the result of an operation whose output is an empty sparse matrix (zero non-zero values).
     * 
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public void allocateSparseAndEmpty() throws DMLRuntimeException {
        setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(0, mat.getNumRows()));
        setDeviceModify(0);
    }

    /**
     * Allocates a dense matrix of size obtained from the attached matrix metadata
     * and fills it up with a single value
     * 
     * @param v value to fill up the dense matrix
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public void allocateAndFillDense(double v) throws DMLRuntimeException {
        long rows = mat.getNumRows();
        long cols = mat.getNumColumns();
        int numElems = toIntExact(rows * cols);
        long size = getDoubleSizeOf(numElems);
        setDenseMatrixCudaPointer(allocate(size));
        setDeviceModify(size);
        // The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
        LibMatrixCUDA.kernels.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
                jcudaDenseMatrixPtr, v, numElems);
    }

    /**
     * Whether this {@link JCudaObject} is sparse and empty.
     * Being allocated is a prerequisite to being sparse and empty.
     * 
     * @return true if sparse and empty
     */
    public boolean isSparseAndEmpty() {
        boolean isSparseAndAllocated = isAllocated() && LibMatrixCUDA.isInSparseFormat(mat);
        boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && jcudaSparseMatrixPtr.nnz == 0;
        return isEmptyAndSparseAndAllocated;
    }

    /**
     * Allocate necessary memory on the GPU for this {@link JCudaObject} instance.
     * 
     * @param isInput if the block is input, isSparse argument is ignored
     * @param isSparse if the block is sparse
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    private void prepare(boolean isInput, boolean isSparse) throws DMLRuntimeException {
        if (isAllocated()) {
            // Already allocated on GPU and expected to be in sync
        } else {
            if (isInput) {
                copyFromHostToDevice();
            } else {
                mat.setDirty(true);
                // Don't copy, just allocate
                if (isSparse) {
                    allocateSparseMatrixOnDevice();
                } else { // Dense block, size = numRows * numCols
                    allocateDenseMatrixOnDevice();
                }
                synchronized (evictionLock) {
                    GPUContext.allocatedPointers.add(this);
                }
            }
        }
        numLocks.addAndGet(1);
    }

    @Override
    public synchronized void acquireDeviceRead() throws DMLRuntimeException {
        prepare(true, false);
        if (!isAllocated())
            throw new DMLRuntimeException("Expected device data to be allocated");
    }

    @Override
    public synchronized void acquireDeviceModifyDense() throws DMLRuntimeException {
        prepare(false, false);
        isDeviceCopyModified = true;
        if (!isAllocated())
            throw new DMLRuntimeException("Expected device data to be allocated");
    }

    @Override
    public synchronized void acquireDeviceModifySparse() throws DMLRuntimeException {
        isInSparseFormat = true;
        prepare(false, true);
        isDeviceCopyModified = true;
        if (!isAllocated())
            throw new DMLRuntimeException("Expected device data to be allocated");
    }

    @Override
    public synchronized void acquireHostRead() throws CacheException {
        if (isAllocated()) {
            try {
                if (isDeviceCopyModified) {
                    copyFromDeviceToHost();
                }
            } catch (DMLRuntimeException e) {
                throw new CacheException(e);
            }
        } else {
            throw new CacheException(
                    "Cannot perform acquireHostRead as the GPU data is not allocated:" + mat.getVarName());
        }
    }

    @Override
    public synchronized void acquireHostModify() throws CacheException {
        if (isAllocated()) {
            try {
                if (isDeviceCopyModified) {
                    throw new DMLRuntimeException("Potential overwrite of GPU data");
                    // copyFromDeviceToHost();
                }
                clearData();
            } catch (DMLRuntimeException e) {
                throw new CacheException(e);
            }
        }
    }

    /**
     * updates the locks depending on the eviction policy selected
     * @throws CacheException if there is no locked GPU Object
     */
    private void updateReleaseLocks() throws CacheException {
        if (numLocks.addAndGet(-1) < 0) {
            throw new CacheException("Redundant release of GPU object");
        }
        if (evictionPolicy == EvictionPolicy.LRU) {
            timestamp.set(System.nanoTime());
        } else if (evictionPolicy == EvictionPolicy.LFU) {
            timestamp.addAndGet(1);
        } else if (evictionPolicy == EvictionPolicy.MIN_EVICT) {
            // Do Nothing
        } else {
            throw new CacheException("The eviction policy is not supported:" + evictionPolicy.name());
        }
    }

    /**
     * releases input allocated on GPU
     * @throws CacheException if data is not allocated
     */
    public synchronized void releaseInput() throws CacheException {
        updateReleaseLocks();
        if (!isAllocated())
            throw new CacheException("Attempting to release an input before allocating it");
    }

    /**
    @Override
    void allocateMemoryOnDevice(long numElemToAllocate) throws DMLRuntimeException {
        if(!isAllocated()) {
            long start = System.nanoTime();
            if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat)) {
                setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(mat.getNnz(), mat.getNumRows()));
                numBytes = CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows());
                JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
                isInSparseFormat = true;
                //throw new DMLRuntimeException("Sparse format not implemented");
            } else if(numElemToAllocate == -1) {
                // Called for dense input
                setDenseMatrixCudaPointer(new Pointer());
                numBytes = mat.getNumRows() * getDoubleSizeOf(mat.getNumColumns());
                cudaMalloc(jcudaDenseMatrixPtr, numBytes);
                JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
            } else {
                // Called for dense output
                setDenseMatrixCudaPointer(new Pointer());
                numBytes = getDoubleSizeOf(numElemToAllocate);
                if(numElemToAllocate <= 0 || numBytes <= 0)
                    throw new DMLRuntimeException("Cannot allocate dense matrix object with " + numElemToAllocate + " elements and size " + numBytes);
                cudaMalloc(jcudaDenseMatrixPtr, numBytes);
                JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
            }

            Statistics.cudaAllocTime.addAndGet(System.nanoTime() - start);
            Statistics.cudaAllocCount.addAndGet(1);
        }
    }
     */

    @Override
    void allocateDenseMatrixOnDevice() throws DMLRuntimeException {
        assert !isAllocated() : "Internal error - trying to allocate a dense matrix on a JCudaObject that is already allocated";
        long rows = mat.getNumRows();
        long cols = mat.getNumColumns();
        assert rows > 0 : "Internal error - invalid number of rows when allocating dense matrix";
        assert cols > 0 : "Internal error - invalid number of columns when allocating dense matrix";
        long size = getDoubleSizeOf(rows * cols);
        Pointer tmp = allocate(size);
        setDenseMatrixCudaPointer(tmp);
        setDeviceModify(size);
    }

    @Override
    void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
        assert !isAllocated() : "Internal error - trying to allocate a sparse matrix on a JCudaObject that is already allocated";
        long rows = mat.getNumRows();
        long nnz = mat.getNnz();
        assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix";
        assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
        CSRPointer tmp = CSRPointer.allocateEmpty(nnz, rows);
        setSparseMatrixCudaPointer(tmp);
        long size = CSRPointer.estimateSize(nnz, rows);
        setDeviceModify(size);
    }

    /**
     * releases output allocated on GPU
     * @throws CacheException if data is not allocated
     */
    @Override
    public synchronized void releaseOutput() throws CacheException {
        updateReleaseLocks();
        isDeviceCopyModified = true;
        if (!isAllocated())
            throw new CacheException("Attempting to release an output before allocating it");
    }

    @Override
    public void setDeviceModify(long numBytes) {
        this.numLocks.addAndGet(1);
        this.numBytes = numBytes;
        JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
    }

    @Override
    void deallocateMemoryOnDevice() {
        if (jcudaDenseMatrixPtr != null) {
            long start = System.nanoTime();
            cudaFree(jcudaDenseMatrixPtr);
            JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(numBytes);
            Statistics.cudaDeAllocTime.addAndGet(System.nanoTime() - start);
            Statistics.cudaDeAllocCount.addAndGet(1);
        }
        if (jcudaSparseMatrixPtr != null) {
            long start = System.nanoTime();
            jcudaSparseMatrixPtr.deallocate();
            JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(numBytes);
            Statistics.cudaDeAllocTime.addAndGet(System.nanoTime() - start);
            Statistics.cudaDeAllocCount.addAndGet(1);
        }
        jcudaDenseMatrixPtr = null;
        jcudaSparseMatrixPtr = null;
        numLocks.set(0);
    }

    /** 
     * Thin wrapper over {@link #evict(long)}
     * @param size size to check
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    static void ensureFreeSpace(long size) throws DMLRuntimeException {
        if (size >= getAvailableMemory()) {
            evict(size);
        }
    }

    @Override
    void copyFromHostToDevice() throws DMLRuntimeException {
        printCaller();
        long start = System.nanoTime();

        MatrixBlock tmp = mat.acquireRead();
        if (tmp.isInSparseFormat()) {

            int rowPtr[] = null;
            int colInd[] = null;
            double[] values = null;

            tmp.recomputeNonZeros();
            long nnz = tmp.getNonZeros();
            mat.getMatrixCharacteristics().setNonZeros(nnz);

            SparseBlock block = tmp.getSparseBlock();
            boolean copyToDevice = true;
            if (block == null && tmp.getNonZeros() == 0) {
                // Allocate empty block --> not necessary
                // To reproduce this, see org.apache.sysml.test.integration.applications.dml.ID3DMLTest
                // rowPtr = new int[0];
                // colInd = new int[0];
                // values = new double[0];
                copyToDevice = false;
            } else if (block == null && tmp.getNonZeros() != 0) {
                throw new DMLRuntimeException("Expected CP sparse block to be not null.");
            } else {
                // CSR is the preferred format for cuSparse GEMM
                // Converts MCSR and COO to CSR
                SparseBlockCSR csrBlock = null;
                if (block instanceof SparseBlockCSR) {
                    csrBlock = (SparseBlockCSR) block;
                } else if (block instanceof SparseBlockCOO) {
                    // TODO - should we do this on the GPU using cusparse<t>coo2csr() ?
                    long t0 = System.nanoTime();
                    SparseBlockCOO cooBlock = (SparseBlockCOO) block;
                    csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(),
                            cooBlock.indexes(), cooBlock.values());
                    Statistics.cudaConversionTime.addAndGet(System.nanoTime() - t0);
                    Statistics.cudaConversionCount.incrementAndGet();
                } else if (block instanceof SparseBlockMCSR) {
                    long t0 = System.nanoTime();
                    SparseBlockMCSR mcsrBlock = (SparseBlockMCSR) block;
                    csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
                    Statistics.cudaConversionTime.addAndGet(System.nanoTime() - t0);
                    Statistics.cudaConversionCount.incrementAndGet();
                } else {
                    throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
                }
                rowPtr = csrBlock.rowPointers();
                colInd = csrBlock.indexes();
                values = csrBlock.values();
            }
            allocateSparseMatrixOnDevice();
            synchronized (evictionLock) {
                GPUContext.allocatedPointers.add(this);
            }
            if (copyToDevice) {
                CSRPointer.copyToDevice(jcudaSparseMatrixPtr, tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd,
                        values);
            }
            // throw new DMLRuntimeException("Sparse matrix is not implemented");
            // tmp.sparseToDense();
        } else {
            double[] data = tmp.getDenseBlock();

            if (data == null && tmp.getSparseBlock() != null)
                throw new DMLRuntimeException("Incorrect sparsity calculation");
            else if (data == null && tmp.getNonZeros() != 0)
                throw new DMLRuntimeException("MatrixBlock is not allocated");
            else if (tmp.getNonZeros() == 0)
                data = new double[tmp.getNumRows() * tmp.getNumColumns()];

            // Copy dense block
            allocateDenseMatrixOnDevice();
            synchronized (evictionLock) {
                GPUContext.allocatedPointers.add(this);
            }
            cudaMemcpy(jcudaDenseMatrixPtr, Pointer.to(data),
                    getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()), cudaMemcpyHostToDevice);
        }

        mat.release();

        Statistics.cudaToDevTime.addAndGet(System.nanoTime() - start);
        Statistics.cudaToDevCount.addAndGet(1);
    }

    public static int toIntExact(long l) throws DMLRuntimeException {
        if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
            throw new DMLRuntimeException("Cannot be cast to int:" + l);
        }
        return (int) l;
    }
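
    // Behaves like java.lang.Math.toIntExact(long), but throws a DMLRuntimeException
    // instead of an ArithmeticException when the value overflows an int.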

    @Override
    protected void copyFromDeviceToHost() throws DMLRuntimeException {
        if (jcudaDenseMatrixPtr != null && jcudaSparseMatrixPtr != null) {
            throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated");
        }

        if (jcudaDenseMatrixPtr != null) {
            printCaller();
            long start = System.nanoTime();
            MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
            tmp.allocateDenseBlock();
            double[] data = tmp.getDenseBlock();

            cudaMemcpy(Pointer.to(data), jcudaDenseMatrixPtr, getDoubleSizeOf(data.length), cudaMemcpyDeviceToHost);

            tmp.recomputeNonZeros();
            mat.acquireModify(tmp);
            mat.release();

            Statistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
            Statistics.cudaFromDevCount.addAndGet(1);
        } else if (jcudaSparseMatrixPtr != null) {
            printCaller();
            if (!LibMatrixCUDA.isInSparseFormat(mat))
                throw new DMLRuntimeException(
                        "Block not in sparse format on host yet the device sparse matrix pointer is not null");

            if (this.isSparseAndEmpty()) {
                MatrixBlock tmp = new MatrixBlock(); // Empty Block
                mat.acquireModify(tmp);
                mat.release();
            } else {
                long start = System.nanoTime();

                int rows = toIntExact(mat.getNumRows());
                int cols = toIntExact(mat.getNumColumns());
                int nnz = toIntExact(jcudaSparseMatrixPtr.nnz);
                int[] rowPtr = new int[rows + 1];
                int[] colInd = new int[nnz];
                double[] values = new double[nnz];
                CSRPointer.copyToHost(jcudaSparseMatrixPtr, rows, nnz, rowPtr, colInd, values);

                SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
                MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
                mat.acquireModify(tmp);
                mat.release();
                Statistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
                Statistics.cudaFromDevCount.addAndGet(1);
            }
        } else {
            throw new DMLRuntimeException(
                    "Cannot copy from device to host as JCuda dense/sparse pointer is not allocated");
        }
        isDeviceCopyModified = false;
    }

    @Override
    protected long getSizeOnDevice() throws DMLRuntimeException {
        long GPUSize = 0;
        long rlen = mat.getNumRows();
        long clen = mat.getNumColumns();
        long nnz = mat.getNnz();

        if (LibMatrixCUDA.isInSparseFormat(mat)) {
            GPUSize = CSRPointer.estimateSize(nnz, rlen);
        } else {
            GPUSize = getDoubleSizeOf(rlen * clen);
        }
        return GPUSize;
    }

    private String getClassAndMethod(StackTraceElement st) {
        String[] str = st.getClassName().split("\\.");
        return str[str.length - 1] + "." + st.getMethodName();
    }

    /**
     * Convenience debugging method.
     * Checks {@link JCudaContext#DEBUG} flag before printing to System.out
     */
    private void printCaller() {
        if (JCudaContext.DEBUG) {
            StackTraceElement[] st = Thread.currentThread().getStackTrace();
            String ret = getClassAndMethod(st[1]);
            for (int i = 2; i < st.length && i < 7; i++) {
                ret += "->" + getClassAndMethod(st[i]);
            }
            System.out.println("CALL_STACK:" + ret);
        }
    }

    /**
     * Convenience method to directly examine the Sparse matrix on GPU
     * 
     * @return CSR (compressed sparse row) pointer
     */
    public CSRPointer getSparseMatrixCudaPointer() {
        return jcudaSparseMatrixPtr;
    }

    /**
     * Convenience method to directly set the sparse matrix on GPU
     * Unless you are sure of what you are doing, make sure to call {@link #setDeviceModify(long)} after this to set the appropriate state.
     * Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)}
     * @param sparseMatrixPtr CSR (compressed sparse row) pointer
     */
    public synchronized void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) {
        this.jcudaSparseMatrixPtr = sparseMatrixPtr;
        this.isInSparseFormat = true;
        if (jcudaDenseMatrixPtr != null) {
            cudaFree(jcudaDenseMatrixPtr);
            jcudaDenseMatrixPtr = null;
        }
    }

    /**
     * Convenience method to directly set the dense matrix pointer on GPU
     * Unless you are sure of what you are doing, make sure to call {@link #setDeviceModify(long)} after this to set the appropriate state.
     * 
     * @param densePtr dense pointer
     */
    public synchronized void setDenseMatrixCudaPointer(Pointer densePtr) {
        this.jcudaDenseMatrixPtr = densePtr;
        this.isInSparseFormat = false;
        if (jcudaSparseMatrixPtr != null) {
            jcudaSparseMatrixPtr.deallocate();
            jcudaSparseMatrixPtr = null;
        }
    }

    /**
     * Converts this JCudaObject from dense to sparse format.
     * 
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public void denseToSparse() throws DMLRuntimeException {
        cusparseHandle cusparseHandle = LibMatrixCUDA.cusparseHandle;
        if (cusparseHandle == null)
            throw new DMLRuntimeException("Expected cusparse to be initialized");
        int rows = toIntExact(mat.getNumRows());
        int cols = toIntExact(mat.getNumColumns());

        if (jcudaDenseMatrixPtr == null || !isAllocated())
            throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call");

        convertDensePtrFromRowMajorToColumnMajor();
        setSparseMatrixCudaPointer(
                columnMajorDenseToRowMajorSparse(cusparseHandle, rows, cols, jcudaDenseMatrixPtr));
        // TODO: What if mat.getNnz() is -1 ?
        numBytes = CSRPointer.estimateSize(mat.getNnz(), rows);
    }

    /**
     * Transposes a dense matrix on the GPU by calling the cublasDgeam operation
     * @param densePtr   Pointer to dense matrix on the GPU
     * @param m         rows in output matrix
     * @param n         columns in output matrix
     * @param lda      leading dimension of the input matrix
     * @param ldc      leading dimension of the output matrix
     * @return         transposed matrix
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public static Pointer transpose(Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException {
        Pointer alpha = LibMatrixCUDA.pointerTo(1.0);
        Pointer beta = LibMatrixCUDA.pointerTo(0.0);
        Pointer A = densePtr;
        Pointer C = JCudaObject.allocate(((long) m) * getDoubleSizeOf(n));

        // Transpose the matrix to get a dense matrix
        JCublas2.cublasDgeam(LibMatrixCUDA.cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta,
                new Pointer(), lda, C, ldc);
        return C;
    }
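
    // Layout note: an m x n row-major buffer is byte-for-byte identical to the n x m matrix
    // in column-major order, so the two convert* helpers below need only a single transpose
    // (cublasDgeam with CUBLAS_OP_T) to switch between the two layouts.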

    /**
     * Convenience method. Converts Row Major Dense Matrix --> Column Major Dense Matrix
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    private void convertDensePtrFromRowMajorToColumnMajor() throws DMLRuntimeException {
        int m = toIntExact(mat.getNumRows());
        int n = toIntExact(mat.getNumColumns());
        int lda = n;
        int ldc = m;
        if (!isAllocated()) {
            throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated");
        }

        Pointer tmp = transpose(jcudaDenseMatrixPtr, m, n, lda, ldc);
        cudaFree(jcudaDenseMatrixPtr);
        setDenseMatrixCudaPointer(tmp);
    }

    private void convertDensePtrFromColMajorToRowMajor() throws DMLRuntimeException {
        int n = toIntExact(mat.getNumRows());
        int m = toIntExact(mat.getNumColumns());
        int lda = n;
        int ldc = m;
        if (!isAllocated()) {
            throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated");
        }

        Pointer tmp = transpose(jcudaDenseMatrixPtr, m, n, lda, ldc);
        cudaFree(jcudaDenseMatrixPtr);
        setDenseMatrixCudaPointer(tmp);
    }

    /**
     * Convert sparse to dense (Performs transpose, use sparseToColumnMajorDense if the kernel can deal with column major format)
     * 
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public void sparseToDense() throws DMLRuntimeException {
        if (jcudaSparseMatrixPtr == null || !isAllocated())
            throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");

        sparseToColumnMajorDense();
        convertDensePtrFromColMajorToRowMajor();
    }

    /**
     * More efficient method to convert sparse to dense but returns dense in column major format
     * 
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public void sparseToColumnMajorDense() throws DMLRuntimeException {
        if (jcudaSparseMatrixPtr == null || !isAllocated())
            throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");

        cusparseHandle cusparseHandle = LibMatrixCUDA.cusparseHandle;
        if (cusparseHandle == null)
            throw new DMLRuntimeException("Expected cusparse to be initialized");
        int rows = toIntExact(mat.getNumRows());
        int cols = toIntExact(mat.getNumColumns());
        setDenseMatrixCudaPointer(jcudaSparseMatrixPtr.toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols));
        numBytes = ((long) rows) * getDoubleSizeOf(cols);
    }

    /**
     * Convenience method to convert a dense matrix on the GPU to a CSR sparse matrix.
     * Since the allocated matrix is temporary, bookkeeping is not updated.
     * Also note that the input dense matrix is expected to be in COLUMN MAJOR FORMAT.
     * Caller is responsible for deallocating memory on GPU.
     * 
     * @param cusparseHandle handle to cusparse library
     * @param rows number of rows
     * @param cols number of columns
     * @param densePtr [in] dense matrix pointer on the GPU in column major format
     * @return CSR (compressed sparse row) pointer
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public static CSRPointer columnMajorDenseToRowMajorSparse(cusparseHandle cusparseHandle, int rows, int cols,
            Pointer densePtr) throws DMLRuntimeException {
        cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
        Pointer nnzPerRowPtr = new Pointer();
        Pointer nnzTotalDevHostPtr = new Pointer();

        ensureFreeSpace(getIntSizeOf(rows + 1));

        long t1 = System.nanoTime();
        cudaMalloc(nnzPerRowPtr, getIntSizeOf(rows));
        cudaMalloc(nnzTotalDevHostPtr, getIntSizeOf(1));
        Statistics.cudaAllocTime.addAndGet(System.nanoTime() - t1);
        Statistics.cudaAllocCount.addAndGet(2);

        // Output is in dense vector format, convert it to CSR
        cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows,
                nnzPerRowPtr, nnzTotalDevHostPtr);
        cudaDeviceSynchronize();
        int[] nnzC = { -1 };

        long t2 = System.nanoTime();
        cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
        Statistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
        Statistics.cudaFromDevCount.addAndGet(2);

        if (nnzC[0] == -1) {
            throw new DMLRuntimeException(
                    "cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector mulitply on the GPU");
        }

        CSRPointer C = CSRPointer.allocateEmpty(nnzC[0], rows);
        cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr,
                C.colInd);
        cudaDeviceSynchronize();

        cudaFree(nnzPerRowPtr);
        cudaFree(nnzTotalDevHostPtr);

        return C;
    }
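
    // Putting the conversions together (illustrative): denseToSparse() transposes the row-major
    // dense pointer into column-major form and then calls this method; sparseToDense() goes the
    // other way via toColumnMajorDenseMatrix() followed by a transpose back to row-major.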

    /**
     * Copies the double array from GPU memory to host memory and returns a string representation.
     * @param A Pointer to memory on device (GPU), assumed to point to a double array
     * @param rows rows in matrix A
     * @param cols columns in matrix A
     * @return the debug string
     * @throws DMLRuntimeException  if DMLRuntimeException occurs
     */
    public static String debugString(Pointer A, long rows, long cols) throws DMLRuntimeException {
        StringBuilder sb = new StringBuilder();
        int len = toIntExact(rows * cols);
        double[] tmp = new double[len];
        cudaMemcpy(Pointer.to(tmp), A, getDoubleSizeOf(len), cudaMemcpyDeviceToHost);
        int k = 0;
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                sb.append(tmp[k]).append(' ');
                k++;
            }
            sb.append('\n');
        }
        return sb.toString();
    }
}