/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.matrix.data;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.commons.math3.util.FastMath;
import org.apache.sysml.lops.MapMultChain.ChainType;
import org.apache.sysml.lops.WeightedCrossEntropy.WCeMMType;
import org.apache.sysml.lops.WeightedDivMM.WDivMMType;
import org.apache.sysml.lops.WeightedSigmoid.WSigmoidType;
import org.apache.sysml.lops.WeightedSquaredLoss.WeightsType;
import org.apache.sysml.lops.WeightedUnaryMM.WUMMType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.functionobjects.SwapIndex;
import org.apache.sysml.runtime.functionobjects.ValueFunction;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
import org.apache.sysml.runtime.util.UtilFunctions;

/**
 * MB: Library for matrix multiplications including MM, MV, VV for all
 * combinations of dense, sparse, and ultra-sparse representations, and for special
 * operations such as transpose-self matrix multiplication.
 *
 * In general, all implementations internally use dense outputs
 * for direct access, but change the final result to sparse if necessary.
 * The only exceptions are ultra-sparse matrix mult, wsloss, and wsigmoid.
 *
 * NOTES on BLAS:
 * * Experiments in 04/2013 showed that even on dense-dense this implementation
 *   is 3x faster than f2j-BLAS-DGEMM, 2x faster than f2c-BLAS-DGEMM, and
 *   on par (+10% after JIT) with a native C implementation.
 * * Calling native BLAS would lose platform independence and would require
 *   JNI calls including data transfer. Furthermore, BLAS does not support sparse
 *   matrices (except Sparse BLAS, with dedicated function calls and matrix formats)
 *   and would be an external dependency.
 * * Experiments in 02/2014 showed that on dense-dense this implementation now achieves
 *   almost 30% peak FP performance. Compared to Intel MKL 11.1 (dgemm, N=1000) it is
 *   just 3.2x (sparsity=1.0) and 1.9x (sparsity=0.5) slower, respectively.
 */
public class LibMatrixMult
{
  //internal configuration
  private static final boolean LOW_LEVEL_OPTIMIZATION = true;
  private static final long MEM_OVERHEAD_THRESHOLD = 2L * 1024 * 1024; //MAX 2 MB
  private static final long PAR_MINFLOP_THRESHOLD = 2L * 1024 * 1024; //MIN 2 MFLOP
  private static final int L2_CACHESIZE = 256 * 1024; //256KB (common size)

  private LibMatrixMult() {
    //prevent instantiation via private constructor
  }

  ////////////////////////////////
  // public matrix mult interface
  ////////////////////////////////

  /**
   * Performs a matrix multiplication and stores the result in the output matrix.
   *
   * All variants use an IKJ access pattern and internally use a dense output. After the
   * actual computation, we recompute nnz and check for sparse/dense representation.
   *
   * @param m1 first matrix
   * @param m2 second matrix
   * @param ret result matrix
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret) throws DMLRuntimeException {
    matrixMult(m1, m2, ret, 0, m1.rlen);
  }

  /**
   * This method allows one to disable sparsity examination of the output. This feature is useful
   * if matrixMult is used as an intermediate operation (for example: LibMatrixDNN). It makes sense
   * for LibMatrixDNN because the output is internally consumed by another dense instruction,
   * which makes repeated conversion to sparse wasteful.
   * This should be used in rare cases and if you are unsure,
   * use the method 'matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret)' instead.
   *
   * @param m1 first matrix
   * @param m2 second matrix
   * @param ret result matrix
   * @param examSparsity if false, sparsity examination is disabled
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean examSparsity) throws DMLRuntimeException {
    matrixMult(m1, m2, ret, 0, m1.rlen, examSparsity);
  }

  public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) throws DMLRuntimeException {
    matrixMult(m1, m2, ret, rl, ru, true);
  }
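  /* Example (illustrative usage sketch, not part of the original file): multiplying two
   * small dense blocks via the sequential entry point above; assumes the standard
   * MatrixBlock(int rlen, int clen, boolean sparse) constructor and quickSetValue,
   * and that inputs are populated before the call.
   *
   *   MatrixBlock a = new MatrixBlock(2, 3, false); //2x3 dense
   *   MatrixBlock b = new MatrixBlock(3, 1, false); //3x1 dense
   *   MatrixBlock c = new MatrixBlock(2, 1, false); //output 2x1
   *   a.quickSetValue(0, 0, 1.0); //... fill remaining cells ...
   *   LibMatrixMult.matrixMult(a, b, c); //c = a %*% b; nnz and sparsity maintained
   */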
  public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean examSparsity) throws DMLRuntimeException {
    //check inputs / outputs
    if (m1.isEmptyBlock(false) || m2.isEmptyBlock(false)) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing: output allocation
    boolean tm2 = checkPrepMatrixMultRightInput(m1, m2);
    m2 = prepMatrixMultRightInput(m1, m2);
    ret.sparse = (m1.isUltraSparse() || m2.isUltraSparse());
    if (!ret.sparse)
      ret.allocateDenseBlock();

    //prepare row-upper for special cases of vector-matrix
    boolean pm2 = checkParMatrixMultRightInputRows(m1, m2, Integer.MAX_VALUE);
    int ru2 = (pm2 && ru == m1.rlen) ? m2.rlen : ru;
    int cu = m2.clen;

    //core matrix mult computation
    if (m1.isUltraSparse() || m2.isUltraSparse())
      matrixMultUltraSparse(m1, m2, ret, 0, ru2);
    else if (!m1.sparse && !m2.sparse)
      matrixMultDenseDense(m1, m2, ret, tm2, pm2, 0, ru2, 0, cu);
    else if (m1.sparse && m2.sparse)
      matrixMultSparseSparse(m1, m2, ret, pm2, 0, ru2);
    else if (m1.sparse)
      matrixMultSparseDense(m1, m2, ret, pm2, 0, ru2);
    else
      matrixMultDenseSparse(m1, m2, ret, pm2, 0, ru2);

    //post-processing: nnz/representation
    if (!ret.sparse)
      ret.recomputeNonZeros();
    if (examSparsity)
      ret.examSparsity();

    //System.out.println("MM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
    //  "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
  }

  /**
   * Performs a multi-threaded matrix multiplication and stores the result in the output matrix.
   * The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
   *
   * @param m1 first matrix
   * @param m2 second matrix
   * @param ret result matrix
   * @param k maximum parallelism
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) throws DMLRuntimeException {
    //check inputs / outputs
    if (m1.isEmptyBlock(false) || m2.isEmptyBlock(false)) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //check too high additional vector-matrix memory requirements (fallback to sequential)
    //check too small workload in terms of flops (fallback to sequential too)
    if (m1.rlen == 1 && (8L * m2.clen * k > MEM_OVERHEAD_THRESHOLD || !LOW_LEVEL_OPTIMIZATION || m2.clen == 1 || m1.isUltraSparse() || m2.isUltraSparse())
        || 2L * m1.rlen * m1.clen * m2.clen < PAR_MINFLOP_THRESHOLD) {
      matrixMult(m1, m2, ret);
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing: output allocation (in contrast to single-threaded,
    //we need to allocate sparse as well in order to prevent synchronization)
    boolean tm2 = checkPrepMatrixMultRightInput(m1, m2);
    m2 = prepMatrixMultRightInput(m1, m2);
    ret.sparse = (m1.isUltraSparse() || m2.isUltraSparse());
    if (!ret.sparse)
      ret.allocateDenseBlock();
    else
      ret.allocateSparseRowsBlock();
    if (!ret.isThreadSafe()) {
      matrixMult(m1, m2, ret);
      return;
    }

    //prepare row-upper for special cases of vector-matrix / matrix-matrix
    boolean pm2r = checkParMatrixMultRightInputRows(m1, m2, k);
    boolean pm2c = checkParMatrixMultRightInputCols(m1, m2, k, pm2r);
    int num = pm2r ? m2.rlen : pm2c ? m2.clen : m1.rlen;

    //core multi-threaded matrix mult computation
    //(currently: always parallelization over number of rows)
    try {
      ExecutorService pool = Executors.newFixedThreadPool(k);
      ArrayList<MatrixMultTask> tasks = new ArrayList<MatrixMultTask>();
      int nk = (pm2r || pm2c) ? k : UtilFunctions.roundToNext(Math.min(8 * k, num / 32), k);
      ArrayList<Integer> blklens = getBalancedBlockSizes(num, nk);
      for (int i = 0, lb = 0; i < blklens.size(); lb += blklens.get(i), i++)
        tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, lb, lb + blklens.get(i)));
      //execute tasks
      List<Future<Object>> taskret = pool.invokeAll(tasks);
      pool.shutdown();
      //aggregate partial results (nnz, ret for vector/matrix)
      ret.nonZeros = 0; //reset after execute
      for (Future<Object> task : taskret) {
        if (pm2r)
          vectAdd((double[]) task.get(), ret.denseBlock, 0, 0, ret.rlen * ret.clen);
        else
          ret.nonZeros += (Long) task.get();
      }
      if (pm2r)
        ret.recomputeNonZeros();
    }
    catch (Exception ex) {
      throw new DMLRuntimeException(ex);
    }

    //post-processing (nnz maintained in parallel)
    ret.examSparsity();

    //System.out.println("MM k="+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
    //  "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
  }

  /**
   * Performs a matrix multiplication chain operation of type t(X)%*%(X%*%v) or t(X)%*%(w*(X%*%v)).
   *
   * All variants use an IKJ access pattern and internally use a dense output. After the
   * actual computation, we recompute nnz and check for sparse/dense representation.
   *
   * @param mX X matrix
   * @param mV v matrix
   * @param mW w matrix
   * @param ret result matrix
   * @param ct chain type
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct) throws DMLRuntimeException {
    //check inputs / outputs (after that mV and mW guaranteed to be dense)
    if (mX.isEmptyBlock(false) || mV.isEmptyBlock(false) || (mW != null && mW.isEmptyBlock(false))) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing: output allocation
    ret.sparse = false;
    ret.allocateDenseBlock();

    //core matrix mult chain computation
    if (mX.sparse)
      matrixMultChainSparse(mX, mV, mW, ret, ct, 0, mX.rlen);
    else
      matrixMultChainDense(mX, mV, mW, ret, ct, 0, mX.rlen);

    //post-processing
    ret.recomputeNonZeros();
    ret.examSparsity();

    //System.out.println("MMChain "+ct.toString()+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
    //  "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
  }
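  /* Example (illustrative, not part of the original file): the fused chain above computes
   * t(X)%*%(X%*%v) in one pass; logically it is equivalent to the two-step variant below,
   * which materializes the mX.rlen x 1 intermediate explicitly (names are hypothetical):
   *
   *   MatrixBlock tmp = new MatrixBlock(X.getNumRows(), 1, false);
   *   LibMatrixMult.matrixMult(X, v, tmp);                            //tmp = X %*% v
   *   //out = t(X) %*% tmp, e.g., after an explicit transpose of X
   *   LibMatrixMult.matrixMultChain(X, v, null, out, ChainType.XtXv); //fused equivalent
   */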
  /**
   * Performs a parallel matrix multiplication chain operation of type t(X)%*%(X%*%v) or t(X)%*%(w*(X%*%v)).
   * The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
   *
   * NOTE: This multi-threaded mmchain operation has additional memory requirements of k*ncol(X)*8bytes
   * for partial aggregation. Current max memory: 256KB; otherwise we fall back to sequential execution.
   *
   * @param mX X matrix
   * @param mV v matrix
   * @param mW w matrix
   * @param ret result matrix
   * @param ct chain type
   * @param k maximum parallelism
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int k) throws DMLRuntimeException {
    //check inputs / outputs (after that mV and mW guaranteed to be dense)
    if (mX.isEmptyBlock(false) || mV.isEmptyBlock(false) || (mW != null && mW.isEmptyBlock(false))) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //check too high additional memory requirements (fallback to sequential)
    //check too small workload in terms of flops (fallback to sequential too)
    if (8L * mV.rlen * k > MEM_OVERHEAD_THRESHOLD || 4L * mX.rlen * mX.clen < PAR_MINFLOP_THRESHOLD) {
      matrixMultChain(mX, mV, mW, ret, ct);
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing (no need to check isThreadSafe)
    ret.sparse = false;
    ret.allocateDenseBlock();

    //core matrix mult chain computation
    //(currently: always parallelization over number of rows)
    try {
      ExecutorService pool = Executors.newFixedThreadPool(k);
      ArrayList<MatrixMultChainTask> tasks = new ArrayList<MatrixMultChainTask>();
      int blklen = (int) (Math.ceil((double) mX.rlen / k));
      blklen += (blklen % 24 != 0) ? 24 - blklen % 24 : 0;
      for (int i = 0; i < k & i * blklen < mX.rlen; i++)
        tasks.add(new MatrixMultChainTask(mX, mV, mW, ct, i * blklen, Math.min((i + 1) * blklen, mX.rlen)));
      //execute tasks
      List<Future<double[]>> taskret = pool.invokeAll(tasks);
      pool.shutdown();
      //aggregate partial results
      for (Future<double[]> task : taskret)
        vectAdd(task.get(), ret.denseBlock, 0, 0, mX.clen);
    }
    catch (Exception ex) {
      throw new DMLRuntimeException(ex);
    }

    //post-processing
    ret.recomputeNonZeros();
    ret.examSparsity();

    //System.out.println("MMChain "+ct.toString()+" k="+k+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
    //  "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
  }

  public static void matrixMultTransposeSelf(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose) throws DMLRuntimeException {
    //check inputs / outputs
    if (m1.isEmptyBlock(false)) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing
    m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose);
    ret.sparse = false;
    ret.allocateDenseBlock();

    if (m1.sparse)
      matrixMultTransposeSelfSparse(m1, ret, leftTranspose, 0, ret.rlen);
    else
      matrixMultTransposeSelfDense(m1, ret, leftTranspose, 0, ret.rlen);

    //post-processing
    copyUpperToLowerTriangle(ret);
    ret.recomputeNonZeros();
    ret.examSparsity();

    //System.out.println("TSMM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+") in "+time.stop());
  }
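  /* Example (illustrative, not part of the original file): transpose-self multiply; only
   * the upper triangle is computed by the kernels and then mirrored via
   * copyUpperToLowerTriangle, since both t(X)%*%X and X%*%t(X) are symmetric:
   *
   *   MatrixBlock out = new MatrixBlock(X.getNumColumns(), X.getNumColumns(), false);
   *   LibMatrixMult.matrixMultTransposeSelf(X, out, true); //out = t(X) %*% X
   */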
  public static void matrixMultTransposeSelf(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int k) throws DMLRuntimeException {
    //check inputs / outputs
    if (m1.isEmptyBlock(false)) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //check no parallelization benefit (fallback to sequential)
    //check too small workload in terms of flops (fallback to sequential too)
    if (ret.rlen == 1
        || leftTranspose && 1L * m1.rlen * m1.clen * m1.clen < PAR_MINFLOP_THRESHOLD
        || !leftTranspose && 1L * m1.clen * m1.rlen * m1.rlen < PAR_MINFLOP_THRESHOLD) {
      matrixMultTransposeSelf(m1, ret, leftTranspose);
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing (no need to check isThreadSafe)
    m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose);
    ret.sparse = false;
    ret.allocateDenseBlock();

    //core multi-threaded matrix mult computation
    try {
      ExecutorService pool = Executors.newFixedThreadPool(k);
      ArrayList<MatrixMultTransposeTask> tasks = new ArrayList<MatrixMultTransposeTask>();
      //load balance via #tasks=2k due to triangular shape
      int blklen = (int) (Math.ceil((double) ret.rlen / (2 * k)));
      for (int i = 0; i < 2 * k & i * blklen < ret.rlen; i++)
        tasks.add(new MatrixMultTransposeTask(m1, ret, leftTranspose, i * blklen, Math.min((i + 1) * blklen, ret.rlen)));
      pool.invokeAll(tasks);
      pool.shutdown();
    }
    catch (Exception ex) {
      throw new DMLRuntimeException(ex);
    }

    //post-processing
    copyUpperToLowerTriangle(ret);
    ret.recomputeNonZeros();
    ret.examSparsity();

    //System.out.println("TSMM k="+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+") in "+time.stop());
  }

  public static void matrixMultPermute(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2) throws DMLRuntimeException {
    //check inputs / outputs
    if (pm1.isEmptyBlock(false) || m2.isEmptyBlock(false))
      return;

    //Timing time = new Timing(true);

    //pre-processing
    ret1.sparse = (m2.sparse || ret1.sparse);
    if (ret1.sparse)
      ret1.allocateSparseRowsBlock();
    else
      ret1.allocateDenseBlock();

    //core permutation mm computation
    if (m2.sparse)
      matrixMultPermuteSparse(pm1, m2, ret1, ret2, 0, pm1.rlen);
    else if (ret1.sparse)
      matrixMultPermuteDenseSparse(pm1, m2, ret1, ret2, 0, pm1.rlen);
    else
      matrixMultPermuteDense(pm1, m2, ret1, ret2, 0, pm1.rlen);

    //post-processing
    ret1.recomputeNonZeros();
    ret1.examSparsity();
    if (ret2 != null) { //optional second output
      ret2.recomputeNonZeros();
      ret2.examSparsity();
    }

    //System.out.println("PMM Seq ("+pm1.isInSparseFormat()+","+pm1.getNumRows()+","+pm1.getNumColumns()+","+pm1.getNonZeros()+")x" +
    //  "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
  }

  public static void matrixMultPermute(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int k) throws DMLRuntimeException {
    //check inputs / outputs
    if (pm1.isEmptyBlock(false) || m2.isEmptyBlock(false))
      return;

    //check no parallelization benefit (fallback to sequential)
    if (pm1.rlen == 1) {
      matrixMultPermute(pm1, m2, ret1, ret2);
      return;
    }

    //Timing time = new Timing(true);

    //allocate first output block (second allocated if needed)
    ret1.sparse = false; //no need to check isThreadSafe
    ret1.allocateDenseBlock();

    try {
      ExecutorService pool = Executors.newFixedThreadPool(k);
      ArrayList<MatrixMultPermuteTask> tasks = new ArrayList<MatrixMultPermuteTask>();
      int blklen = (int) (Math.ceil((double) pm1.rlen / k));
      for (int i = 0; i < k & i * blklen < pm1.rlen; i++)
        tasks.add(new MatrixMultPermuteTask(pm1, m2, ret1, ret2, i * blklen, Math.min((i + 1) * blklen, pm1.rlen)));
      pool.invokeAll(tasks);
      pool.shutdown();
    }
    catch (InterruptedException e) {
      throw new DMLRuntimeException(e);
    }

    //post-processing
    ret1.recomputeNonZeros();
    ret1.examSparsity();
    if (ret2 != null) { //optional second output
      ret2.recomputeNonZeros();
      ret2.examSparsity();
    }

    //System.out.println("PMM Par ("+pm1.isInSparseFormat()+","+pm1.getNumRows()+","+pm1.getNumColumns()+","+pm1.getNonZeros()+")x" +
    //  "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
  }
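  /* Example (illustrative, not part of the original file): pm1 is a column vector that
   * encodes a permutation/selection of rows of m2: pm1[i]=p (1-based) moves row i of m2
   * to target row p-1; pm1[i]=0 drops the row. With brlen=3 and pm1=(2,0,4):
   *
   *   //row 0 of m2 -> ret1, in-block row 1   ((2-1)%3=1, block (2-1)/3=0)
   *   //row 1 of m2 -> dropped                (pm1[1]==0)
   *   //row 2 of m2 -> ret2, in-block row 0   ((4-1)%3=0, block (4-1)/3=1)
   */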
  public static void matrixMultWSLoss(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt) throws DMLRuntimeException {
    //check for empty result
    if (wt == WeightsType.POST && mW.isEmptyBlock(false)
        || wt == WeightsType.POST_NZ && mX.isEmptyBlock(false)) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //Timing time = new Timing(true);

    //core weighted square sum mm computation
    if (!mX.sparse && !mU.sparse && !mV.sparse && (mW == null || !mW.sparse)
        && !mX.isEmptyBlock() && !mU.isEmptyBlock() && !mV.isEmptyBlock() && (mW == null || !mW.isEmptyBlock()))
      matrixMultWSLossDense(mX, mU, mV, mW, ret, wt, 0, mX.rlen);
    else if (mX.sparse && !mU.sparse && !mV.sparse && (mW == null || mW.sparse)
        && !mX.isEmptyBlock() && !mU.isEmptyBlock() && !mV.isEmptyBlock() && (mW == null || !mW.isEmptyBlock()))
      matrixMultWSLossSparseDense(mX, mU, mV, mW, ret, wt, 0, mX.rlen);
    else
      matrixMultWSLossGeneric(mX, mU, mV, mW, ret, wt, 0, mX.rlen);

    //System.out.println("MMWSLoss " +wt.toString()+ " ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
    //  "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
  }

  public static void matrixMultWSLoss(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int k) throws DMLRuntimeException {
    //check for empty result
    if (wt == WeightsType.POST && mW.isEmptyBlock(false)
        || wt == WeightsType.POST_NZ && mX.isEmptyBlock(false)) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //check no parallelization benefit (fallback to sequential)
    //no need to check isThreadSafe (scalar output)
    if (mX.rlen == 1) {
      matrixMultWSLoss(mX, mU, mV, mW, ret, wt);
      return;
    }

    //Timing time = new Timing(true);

    try {
      ExecutorService pool = Executors.newFixedThreadPool(k);
      ArrayList<MatrixMultWSLossTask> tasks = new ArrayList<MatrixMultWSLossTask>();
      int blklen = (int) (Math.ceil((double) mX.rlen / k));
      for (int i = 0; i < k & i * blklen < mX.rlen; i++)
        tasks.add(new MatrixMultWSLossTask(mX, mU, mV, mW, wt, i * blklen, Math.min((i + 1) * blklen, mX.rlen)));
      List<Future<Double>> taskret = pool.invokeAll(tasks);
      pool.shutdown();
      //aggregate partial results
      sumScalarResults(taskret, ret);
    }
    catch (Exception e) {
      throw new DMLRuntimeException(e);
    }

    //System.out.println("MMWSLoss "+wt.toString()+" k="+k+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
    //  "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
  }

  public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt) throws DMLRuntimeException {
    //check for empty result
    if (mW.isEmptyBlock(false)) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing
    ret.sparse = mW.sparse;
    ret.allocateDenseOrSparseBlock();

    //core weighted sigmoid mm computation
    if (!mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
      matrixMultWSigmoidDense(mW, mU, mV, ret, wt, 0, mW.rlen);
    else if (mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
      matrixMultWSigmoidSparseDense(mW, mU, mV, ret, wt, 0, mW.rlen);
    else
      matrixMultWSigmoidGeneric(mW, mU, mV, ret, wt, 0, mW.rlen);

    //post-processing
    ret.recomputeNonZeros();
    ret.examSparsity();

    //System.out.println("MMWSig "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
    //  "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
  }
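  /* Example (illustrative, not part of the original file): in DML terms the weighted-sigmoid
   * pattern above is W * sigmoid(U %*% t(V)) (with minus/log variants per WSigmoidType);
   * a scalar sketch of one output cell for the basic type, evaluated only where W is non-zero:
   *
   *   double uvij = dotProduct(u, v, uix, vix, rank); //row i of U times row j of V
   *   c[ix] = wij / (1 + FastMath.exp(-uvij));        //wij * sigmoid(uvij)
   */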
("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); } public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int k) throws DMLRuntimeException { //check for empty result if (mW.isEmptyBlock(false)) { ret.examSparsity(); //turn empty dense into sparse return; } //check no parallelization benefit (fallback to sequential) if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) { matrixMultWSigmoid(mW, mU, mV, ret, wt); return; } //Timing time = new Timing(true); //pre-processing ret.sparse = mW.sparse; ret.allocateDenseOrSparseBlock(); try { ExecutorService pool = Executors.newFixedThreadPool(k); ArrayList<MatrixMultWSigmoidTask> tasks = new ArrayList<MatrixMultWSigmoidTask>(); int blklen = (int) (Math.ceil((double) mW.rlen / k)); for (int i = 0; i < k & i * blklen < mW.rlen; i++) tasks.add(new MatrixMultWSigmoidTask(mW, mU, mV, ret, wt, i * blklen, Math.min((i + 1) * blklen, mW.rlen))); //execute tasks List<Future<Long>> taskret = pool.invokeAll(tasks); pool.shutdown(); //aggregate partial nnz and check for errors ret.nonZeros = 0; //reset after execute for (Future<Long> task : taskret) ret.nonZeros += task.get(); } catch (Exception e) { throw new DMLRuntimeException(e); } //post-processing (nnz maintained in parallel) ret.examSparsity(); //System.out.println("MMWSig "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop() + "."); } /** * NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations * have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail, * this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero * which would give 0/0=NaN) but INF/-INF for non-zero entries in V where the corresponding cell in (Y%*%X) * is zero. * * @param mW matrix W * @param mU matrix U * @param mV matrix V * @param mX matrix X * @param ret result type * @param wt weighted divide matrix multiplication type * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt) throws DMLRuntimeException { //check for empty result if (mW.isEmptyBlock(false) || (wt.isLeft() && mU.isEmptyBlock(false)) || (wt.isRight() && mV.isEmptyBlock(false)) || (wt.isBasic() && mW.isEmptyBlock(false))) { ret.examSparsity(); //turn empty dense into sparse return; } //Timing time = new Timing(true); //pre-processing ret.sparse = wt.isBasic() ? 
  /**
   * NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations
   * have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail,
   * this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero,
   * which would give 0/0=NaN) but INF/-INF for non-zero entries in W where the corresponding cell in
   * (U%*%t(V)) is zero.
   *
   * @param mW matrix W
   * @param mU matrix U
   * @param mV matrix V
   * @param mX matrix X
   * @param ret result matrix
   * @param wt weighted divide matrix multiplication type
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt) throws DMLRuntimeException {
    //check for empty result
    if (mW.isEmptyBlock(false)
        || (wt.isLeft() && mU.isEmptyBlock(false))
        || (wt.isRight() && mV.isEmptyBlock(false))
        || (wt.isBasic() && mW.isEmptyBlock(false))) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing
    ret.sparse = wt.isBasic() ? mW.sparse : false;
    ret.allocateDenseOrSparseBlock();

    //core weighted div mm computation
    boolean scalarX = wt.hasScalar();
    if (!mW.sparse && !mU.sparse && !mV.sparse && (mX == null || !mX.sparse || scalarX) && !mU.isEmptyBlock() && !mV.isEmptyBlock())
      matrixMultWDivMMDense(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
    else if (mW.sparse && !mU.sparse && !mV.sparse && (mX == null || mX.sparse || scalarX) && !mU.isEmptyBlock() && !mV.isEmptyBlock())
      matrixMultWDivMMSparseDense(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
    else
      matrixMultWDivMMGeneric(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);

    //post-processing
    ret.recomputeNonZeros();
    ret.examSparsity();

    //System.out.println("MMWDiv "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
    //  "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
  }
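  /* Example (illustrative, not part of the original file) of the limited NaN support
   * described above for W/(U%*%t(V)): cells with wij==0 are skipped (sparse-safe), so the
   * 0/0=NaN of exact arithmetic never materializes, while wij!=0 over a zero product cell
   * follows plain Java double semantics:
   *
   *   double r1 = 3.0 / 0.0; //Infinity (kept in the output)
   *   double r2 = 0.0 / 0.0; //NaN in plain arithmetic; never computed here since wij==0 is skipped
   */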
  /**
   * NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations
   * have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail,
   * this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero,
   * which would give 0/0=NaN) but INF/-INF for non-zero entries in W where the corresponding cell in
   * (U%*%t(V)) is zero.
   *
   * @param mW matrix W
   * @param mU matrix U
   * @param mV matrix V
   * @param mX matrix X
   * @param ret result matrix
   * @param wt weighted divide matrix multiplication type
   * @param k maximum parallelism
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int k) throws DMLRuntimeException {
    //check for empty result
    if (mW.isEmptyBlock(false)
        || (wt.isLeft() && mU.isEmptyBlock(false))
        || (wt.isRight() && mV.isEmptyBlock(false))
        || (wt.isBasic() && mW.isEmptyBlock(false))) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing
    ret.sparse = wt.isBasic() ? mW.sparse : false;
    ret.allocateDenseOrSparseBlock();
    if (!ret.isThreadSafe()) {
      matrixMultWDivMM(mW, mU, mV, mX, ret, wt);
      return;
    }

    try {
      ExecutorService pool = Executors.newFixedThreadPool(k);
      ArrayList<MatrixMultWDivTask> tasks = new ArrayList<MatrixMultWDivTask>();
      //create tasks (for wdivmm-left, parallelization over columns;
      //for wdivmm-right, parallelization over rows; both ensure disjoint results)
      if (wt.isLeft()) {
        int blklen = (int) (Math.ceil((double) mW.clen / k));
        for (int j = 0; j < k & j * blklen < mW.clen; j++)
          tasks.add(new MatrixMultWDivTask(mW, mU, mV, mX, ret, wt, 0, mW.rlen, j * blklen, Math.min((j + 1) * blklen, mW.clen)));
      }
      else { //basic/right
        int blklen = (int) (Math.ceil((double) mW.rlen / k));
        for (int i = 0; i < k & i * blklen < mW.rlen; i++)
          tasks.add(new MatrixMultWDivTask(mW, mU, mV, mX, ret, wt, i * blklen, Math.min((i + 1) * blklen, mW.rlen), 0, mW.clen));
      }
      //execute tasks
      List<Future<Long>> taskret = pool.invokeAll(tasks);
      pool.shutdown();
      //aggregate partial nnz and check for errors
      ret.nonZeros = 0; //reset after execute
      for (Future<Long> task : taskret)
        ret.nonZeros += task.get();
    }
    catch (Exception e) {
      throw new DMLRuntimeException(e);
    }

    //post-processing
    ret.examSparsity();

    //System.out.println("MMWDiv "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
    //  "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
  }

  public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt) throws DMLRuntimeException {
    //check for empty result
    if (mW.isEmptyBlock(false)) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing
    ret.sparse = false;
    ret.allocateDenseBlock();

    //core weighted cross entropy mm computation
    if (!mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
      matrixMultWCeMMDense(mW, mU, mV, eps, ret, wt, 0, mW.rlen);
    else if (mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
      matrixMultWCeMMSparseDense(mW, mU, mV, eps, ret, wt, 0, mW.rlen);
    else
      matrixMultWCeMMGeneric(mW, mU, mV, eps, ret, wt, 0, mW.rlen);

    //System.out.println("MMWCe "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
    //  "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
  }

  public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt, int k) throws DMLRuntimeException {
    //check for empty result
    if (mW.isEmptyBlock(false)) {
      ret.examSparsity(); //turn empty dense into sparse
      return;
    }

    //Timing time = new Timing(true);

    //pre-processing (no need to check isThreadSafe)
    ret.sparse = false;
    ret.allocateDenseBlock();

    try {
      ExecutorService pool = Executors.newFixedThreadPool(k);
      ArrayList<MatrixMultWCeTask> tasks = new ArrayList<MatrixMultWCeTask>();
      int blklen = (int) (Math.ceil((double) mW.rlen / k));
      for (int i = 0; i < k & i * blklen < mW.rlen; i++)
        tasks.add(new MatrixMultWCeTask(mW, mU, mV, eps, wt, i * blklen, Math.min((i + 1) * blklen, mW.rlen)));
      List<Future<Double>> taskret = pool.invokeAll(tasks);
      pool.shutdown();
      //aggregate partial results
      sumScalarResults(taskret, ret);
    }
    catch (Exception e) {
      throw new DMLRuntimeException(e);
    }

    //System.out.println("MMWCe "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
    //  "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
  }
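  /* Example (illustrative, not part of the original file): weighted cross-entropy reduces
   * to a single scalar; loosely, it accumulates log-likelihood terms of the factorization
   * over the non-zeros of W, e.g., per cell (exact terms depend on WCeMMType and eps):
   *
   *   wceval += wij * FastMath.log(dotProduct(u, v, uix, vix, rank) + eps);
   */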
("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); } public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn) throws DMLRuntimeException { //check for empty result if (mW.isEmptyBlock(false)) { ret.examSparsity(); //turn empty dense into sparse return; } //Timing time = new Timing(true); //pre-processing ret.sparse = mW.sparse; ret.allocateDenseOrSparseBlock(); //core weighted square sum mm computation if (!mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock()) matrixMultWuMMDense(mW, mU, mV, ret, wt, fn, 0, mW.rlen); else if (mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock()) matrixMultWuMMSparseDense(mW, mU, mV, ret, wt, fn, 0, mW.rlen); else matrixMultWuMMGeneric(mW, mU, mV, ret, wt, fn, 0, mW.rlen); //post-processing ret.recomputeNonZeros(); ret.examSparsity(); //System.out.println("MMWu "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop()); } public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int k) throws DMLRuntimeException { //check for empty result if (mW.isEmptyBlock(false)) { ret.examSparsity(); //turn empty dense into sparse return; } //check no parallelization benefit (fallback to sequential) if (mW.rlen == 1 || !MatrixBlock.isThreadSafe(mW.sparse)) { matrixMultWuMM(mW, mU, mV, ret, wt, fn); return; } //Timing time = new Timing(true); //pre-processing ret.sparse = mW.sparse; ret.allocateDenseOrSparseBlock(); try { ExecutorService pool = Executors.newFixedThreadPool(k); ArrayList<MatrixMultWuTask> tasks = new ArrayList<MatrixMultWuTask>(); int blklen = (int) (Math.ceil((double) mW.rlen / k)); for (int i = 0; i < k & i * blklen < mW.rlen; i++) tasks.add(new MatrixMultWuTask(mW, mU, mV, ret, wt, fn, i * blklen, Math.min((i + 1) * blklen, mW.rlen))); //execute tasks List<Future<Long>> taskret = pool.invokeAll(tasks); pool.shutdown(); //aggregate partial nnz and check for errors ret.nonZeros = 0; //reset after execute for (Future<Long> task : taskret) ret.nonZeros += task.get(); } catch (Exception e) { throw new DMLRuntimeException(e); } //post-processing (nnz maintained in parallel) ret.examSparsity(); //System.out.println("MMWu "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" + // "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop() + "."); } ////////////////////////////////////////// // optimized matrix mult implementation // ////////////////////////////////////////// private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) throws DMLRuntimeException { double[] a = m1.denseBlock; double[] b = m2.denseBlock; double[] c = ret.denseBlock; final int m = m1.rlen; final int n = m2.clen; final int cd = m1.clen; if (LOW_LEVEL_OPTIMIZATION) { if (m == 1 && n == 1) //DOT PRODUCT { c[0] = dotProduct(a, b, cd); } else if (n > 1 && cd == 1) //OUTER PRODUCT { for (int i = rl, cix = rl * n; i < ru; i++, cix += n) { if (a[i] == 1) 
  //////////////////////////////////////////
  // optimized matrix mult implementation //
  //////////////////////////////////////////

  private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) throws DMLRuntimeException {
    double[] a = m1.denseBlock;
    double[] b = m2.denseBlock;
    double[] c = ret.denseBlock;
    final int m = m1.rlen;
    final int n = m2.clen;
    final int cd = m1.clen;

    if (LOW_LEVEL_OPTIMIZATION) {
      if (m == 1 && n == 1) //DOT PRODUCT
      {
        c[0] = dotProduct(a, b, cd);
      }
      else if (n > 1 && cd == 1) //OUTER PRODUCT
      {
        for (int i = rl, cix = rl * n; i < ru; i++, cix += n) {
          if (a[i] == 1)
            System.arraycopy(b, 0, c, cix, n);
          else if (a[i] != 0)
            vectMultiplyWrite(a[i], b, c, 0, cix, n);
          else
            Arrays.fill(c, cix, cix + n, 0);
        }
      }
      else if (n == 1 && cd == 1) //VECTOR-SCALAR
      {
        vectMultiplyWrite(b[0], a, c, rl, rl, ru - rl);
      }
      else if (n == 1 && cd <= 2 * 1024) //MATRIX-VECTOR (short rhs)
      {
        for (int i = rl, aix = rl * cd; i < ru; i++, aix += cd)
          c[i] = dotProduct(a, b, aix, 0, cd);
      }
      else if (n == 1) //MATRIX-VECTOR (tall rhs)
      {
        final int blocksizeI = 32;
        final int blocksizeK = 2 * 1024; //16KB vector blocks (L1)
        for (int bi = rl; bi < ru; bi += blocksizeI) {
          int bimin = Math.min(bi + blocksizeI, ru);
          for (int bk = 0; bk < cd; bk += blocksizeK) {
            int bkmin = Math.min(bk + blocksizeK, cd);
            for (int i = bi, aix = bi * cd + bk; i < bimin; i++, aix += cd)
              c[i] += dotProduct(a, b, aix, bk, bkmin - bk);
          }
        }
      }
      else if (pm2 && m == 1) //VECTOR-MATRIX
      {
        //parallelization over rows in rhs matrix
        //rest not aligned to blocks of 2 rows
        final int kn = (ru - rl) % 2;
        if (kn == 1 && a[rl] != 0)
          vectMultiplyAdd(a[rl], b, c, rl * n, 0, n);
        //compute blocks of 2 rows (2 instead of 4 for small n<64)
        for (int k = rl + kn, bix = (rl + kn) * n; k < ru; k += 2, bix += 2 * n) {
          if (a[k] != 0 && a[k + 1] != 0)
            vectMultiplyAdd2(a[k], a[k + 1], b, c, bix, bix + n, 0, n);
          else if (a[k] != 0)
            vectMultiplyAdd(a[k], b, c, bix, 0, n);
          else if (a[k + 1] != 0)
            vectMultiplyAdd(a[k + 1], b, c, bix + n, 0, n);
        }
      }
      else if (pm2 && m <= 16) //MATRIX-MATRIX (short lhs)
      {
        //cache-conscious parallelization over rows in rhs matrix
        final int kn = (ru - rl) % 4; //rest not aligned to blocks of 4 rows
        for (int i = 0, aix = 0, cix = 0; i < m; i++, aix += cd, cix += n)
          for (int k = rl, bix = rl * n; k < rl + kn; k++, bix += n)
            if (a[aix + k] != 0)
              vectMultiplyAdd(a[aix + k], b, c, bix, cix, n);

        final int blocksizeK = 48;
        final int blocksizeJ = 1024;
        //blocked execution
        for (int bk = rl + kn; bk < ru; bk += blocksizeK)
          for (int bj = 0, bkmin = Math.min(cd, bk + blocksizeK); bj < n; bj += blocksizeJ) {
            //compute blocks of 4 rows in rhs w/ IKJ
            int bjlen = Math.min(n, bj + blocksizeJ) - bj;
            for (int i = 0, aix = 0, cix = bj; i < m; i++, aix += cd, cix += n)
              for (int k = bk, bix = bk * n; k < bkmin; k += 4, bix += 4 * n) {
                vectMultiplyAdd4(a[aix + k], a[aix + k + 1], a[aix + k + 2], a[aix + k + 3],
                  b, c, bix, bix + n, bix + 2 * n, bix + 3 * n, cix, bjlen);
              }
          }
      }
      else if (tm2) //MATRIX-MATRIX (skinny rhs)
      {
        //note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
        //however, explicit flag required since dimension change m2
        final int n2 = m2.rlen;
        for (int i = rl, aix = rl * cd, cix = rl * n2; i < ru; i++, aix += cd, cix += n2)
          for (int j = 0, bix = 0; j < n2; j++, bix += cd)
            c[cix + j] = dotProduct(a, b, aix, bix, cd);
      }
      else //MATRIX-MATRIX
      {
        //1) Unrolled inner loop (for better instruction-level parallelism)
        //2) Blocked execution (for less cache thrashing in parallel exec)
        //3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
        final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
        final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
        final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan

        //temporary arrays (nnz a, b index)
        double[] ta = new double[blocksizeK];
        int[] tbi = new int[blocksizeK];
        //blocked execution
        for (int bi = rl; bi < ru; bi += blocksizeI)
          for (int bk = 0, bimin = Math.min(ru, bi + blocksizeI); bk < cd; bk += blocksizeK)
            for (int bj = cl, bkmin = Math.min(cd, bk + blocksizeK); bj < cu; bj += blocksizeJ) {
              int bklen = bkmin - bk;
              int bjlen = Math.min(cu, bj + blocksizeJ) - bj;
              //core sub block matrix multiplication
              for (int i = bi; i < bimin; i++) {
                int aixi = i * cd + bk; //start index on a
                int cixj = i * n + bj; //scan index on c
                //determine nnz of a (for sparsity-aware skipping of rows)
                int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
                //if( knnz > 0 ) //for skipping empty rows
                //rest not aligned to blocks of 4 rows
                final int bn = knnz % 4;
                switch (bn) {
                  case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
                  case 2: vectMultiplyAdd2(ta[0], ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
                  case 3: vectMultiplyAdd3(ta[0], ta[1], ta[2], b, c, tbi[0], tbi[1], tbi[2], cixj, bjlen); break;
                }
                //compute blocks of 4 rows (core inner loop)
                for (int k = bn; k < knnz; k += 4) {
                  vectMultiplyAdd4(ta[k], ta[k + 1], ta[k + 2], ta[k + 3],
                    b, c, tbi[k], tbi[k + 1], tbi[k + 2], tbi[k + 3], cixj, bjlen);
                }
              }
            }
      }
    }
    else {
      double val;
      for (int i = rl, aix = rl * cd, cix = rl * n; i < ru; i++, cix += n)
        for (int k = 0, bix = 0; k < cd; k++, aix++, bix += n) {
          val = a[aix];
          if (val != 0)
            for (int j = 0; j < n; j++)
              c[cix + j] += val * b[bix + j];
        }
    }
  }

  private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) throws DMLRuntimeException {
    double[] a = m1.denseBlock;
    double[] c = ret.denseBlock;
    int m = m1.rlen;
    int cd = m1.clen;
    int n = m2.clen;

    // MATRIX-MATRIX (VV, MV not applicable here because V always dense)
    if (LOW_LEVEL_OPTIMIZATION) {
      final int blocksizeI = 32; //256KB c block (typical L2 size per core), 32KB a block
      final int blocksizeK = 32;
      //note: in contrast to dense-dense, no blocking over j (would require maintaining blocksizeK indexes, counter-productive on skew)
      SparseBlock b = m2.sparseBlock;

      if (pm2 && m == 1) //VECTOR-MATRIX
      {
        //parallelization over rows in rhs matrix
        for (int k = rl; k < ru; k++)
          if (a[k] != 0 && !b.isEmpty(k)) {
            vectMultiplyAdd(a[k], b.values(k), c, b.indexes(k), b.pos(k), 0, b.size(k));
          }
      }
      else //MATRIX-MATRIX
      {
        //blocked execution
        for (int bi = rl; bi < ru; bi += blocksizeI)
          for (int bk = 0, bimin = Math.min(ru, bi + blocksizeI); bk < cd; bk += blocksizeK) {
            int bklen = Math.min(cd, bk + blocksizeK) - bk;
            //core sub block matrix multiplication
            for (int i = bi; i < bimin; i++) {
              int aixi = i * cd + bk; //start index on a
              int cixj = i * n + 0; //scan index on c
              for (int k = 0; k < bklen; k++) {
                double val = a[aixi + k];
                if (val != 0 && !b.isEmpty(bk + k)) {
                  vectMultiplyAdd(val, b.values(bk + k), c, b.indexes(bk + k), b.pos(bk + k), cixj, b.size(bk + k));
                }
              }
            }
          }
      }
    }
    else {
      SparseBlock b = m2.sparseBlock;
      for (int i = rl, aix = rl * cd, cix = rl * n; i < ru; i++, cix += n)
        for (int k = 0; k < cd; k++, aix++) {
          double val = a[aix];
          if (val != 0) {
            if (!b.isEmpty(k)) {
              int bpos = b.pos(k);
              int blen = b.size(k);
              int[] bix = b.indexes(k);
              double[] bvals = b.values(k);
              for (int j = bpos; j < bpos + blen; j++)
                c[cix + bix[j]] += val * bvals[j];
            }
          }
        }
    }
  }
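  /* Example (illustrative, not part of the original file): the dense kernels above rely on
   * 4-way manual unrolling; vectMultiplyAdd4 is semantically equivalent to the scalar loop
   * below, which exposes four independent multiply-adds per iteration to the JIT:
   *
   *   for (int j = 0; j < len; j++)
   *     c[ci + j] += av1 * b[bi1 + j] + av2 * b[bi2 + j]
   *                + av3 * b[bi3 + j] + av4 * b[bi4 + j];
   */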
  private static void matrixMultSparseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) throws DMLRuntimeException {
    double[] b = m2.denseBlock;
    double[] c = ret.denseBlock;
    final int m = m1.rlen;
    final int n = m2.clen;
    final int cd = m2.rlen;
    final long xsp = (long) m * cd / m1.nonZeros;

    if (LOW_LEVEL_OPTIMIZATION) {
      SparseBlock a = m1.sparseBlock;

      if (m == 1 && n == 1) //DOT PRODUCT
      {
        if (!a.isEmpty(0)) {
          c[0] = dotProduct(a.values(0), b, a.indexes(0), a.pos(0), 0, a.size(0));
        }
      }
      else if (n == 1 && cd <= 2 * 1024) //MATRIX-VECTOR (short rhs)
      {
        for (int i = rl; i < ru; i++)
          if (!a.isEmpty(i))
            c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
      }
      else if (n == 1) //MATRIX-VECTOR (tall rhs)
      {
        final int blocksizeI = 32;
        final int blocksizeK = (int) Math.max(2 * 1024, 2 * 1024 * xsp / 32); //~ 16KB L1
        int[] curk = new int[blocksizeI];

        for (int bi = rl; bi < ru; bi += blocksizeI) {
          Arrays.fill(curk, 0); //reset positions
          for (int bk = 0, bimin = Math.min(ru, bi + blocksizeI); bk < cd; bk += blocksizeK) {
            for (int i = bi, bkmin = Math.min(bk + blocksizeK, cd); i < bimin; i++) {
              if (a.isEmpty(i)) continue;
              int apos = a.pos(i);
              int alen = a.size(i);
              int[] aix = a.indexes(i);
              double[] avals = a.values(i);
              int k = curk[i - bi] + apos;
              for (; k < apos + alen && aix[k] < bkmin; k++)
                c[i] += avals[k] * b[aix[k]];
              curk[i - bi] = k - apos;
            }
          }
        }
      }
      else if (pm2 && m == 1) //VECTOR-MATRIX
      {
        //parallelization over rows in rhs matrix
        if (!a.isEmpty(0)) {
          int alen = a.size(0);
          int[] aix = a.indexes(0);
          double[] avals = a.values(0);
          int rlix = (rl == 0) ? 0 : a.posFIndexGTE(0, rl);
          rlix = (rlix >= 0) ? rlix : alen;

          for (int k = rlix; k < alen && aix[k] < ru; k++) {
            if (k + 1 < alen && aix[k + 1] < ru)
              vectMultiplyAdd2(avals[k], avals[k + 1], b, c, aix[k] * n, aix[++k] * n, 0, n);
            else
              vectMultiplyAdd(avals[k], b, c, aix[k] * n, 0, n);
          }
        }
      }
      else if (pm2 && m <= 16) //MATRIX-MATRIX (short lhs)
      {
        int arlen = a.numRows();
        for (int i = 0, cix = 0; i < arlen; i++, cix += n)
          if (!a.isEmpty(i)) {
            int apos = a.pos(i);
            int alen = a.size(i);
            int[] aix = a.indexes(i);
            double[] avals = a.values(i);

            int k1 = (rl == 0) ? apos : a.posFIndexGTE(i, rl);
            k1 = (k1 >= 0) ? k1 : apos + alen;
            int k2 = (ru == cd) ? apos + alen : a.posFIndexGTE(i, ru);
            k2 = (k2 >= 0) ? k2 : apos + alen;
            //rest not aligned to blocks of 4 rows
            final int bn = (k2 - k1) % 4;
            switch (bn) {
              case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1] * n, cix, n); break;
              case 2: vectMultiplyAdd2(avals[k1], avals[k1 + 1], b, c, aix[k1] * n, aix[k1 + 1] * n, cix, n); break;
              case 3: vectMultiplyAdd3(avals[k1], avals[k1 + 1], avals[k1 + 2], b, c, aix[k1] * n, aix[k1 + 1] * n, aix[k1 + 2] * n, cix, n); break;
            }
            //compute blocks of 4 rows (core inner loop)
            for (int k = k1 + bn; k < k2; k += 4) {
              vectMultiplyAdd4(avals[k], avals[k + 1], avals[k + 2], avals[k + 3],
                b, c, aix[k] * n, aix[k + 1] * n, aix[k + 2] * n, aix[k + 3] * n, cix, n);
            }
          }
      }
      else if (n <= 64) //MATRIX-MATRIX (skinny rhs)
      {
        //no blocking since b and c fit into cache anyway
        for (int i = rl, cix = rl * n; i < ru; i++, cix += n) {
          if (a.isEmpty(i)) continue;
          int apos = a.pos(i);
          int alen = a.size(i);
          int[] aix = a.indexes(i);
          double[] avals = a.values(i);
          //rest not aligned to blocks of 4 rows
          int bn = alen % 4;
          for (int k = apos; k < apos + bn; k++)
            vectMultiplyAdd(avals[k], b, c, aix[k] * n, cix, n);
          //compute blocks of 4 rows (core inner loop)
          for (int k = apos + bn; k < apos + alen; k += 4)
            vectMultiplyAdd4(avals[k], avals[k + 1], avals[k + 2], avals[k + 3],
              b, c, aix[k] * n, aix[k + 1] * n, aix[k + 2] * n, aix[k + 3] * n, cix, n);
        }
      }
      else //MATRIX-MATRIX
      {
        //blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size,
        //while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
        final int blocksizeI = 32;
        final int blocksizeK = 24;
        final int blocksizeJ = 1024;

        //temporary array of current sparse positions
        int[] curk = new int[blocksizeI];

        //blocked execution over IKJ
        for (int bi = rl; bi < ru; bi += blocksizeI) {
          Arrays.fill(curk, 0); //reset positions
          for (int bk = 0, bimin = Math.min(ru, bi + blocksizeI); bk < cd; bk += blocksizeK) {
            for (int bj = 0, bkmin = Math.min(cd, bk + blocksizeK); bj < n; bj += blocksizeJ) {
              int bjlen = Math.min(n, bj + blocksizeJ) - bj;
              //core sub block matrix multiplication
              for (int i = bi, cix = bi * n + bj; i < bimin; i++, cix += n) {
                if (!a.isEmpty(i)) {
                  int apos = a.pos(i);
                  int alen = a.size(i);
                  int[] aix = a.indexes(i);
                  double[] avals = a.values(i);
                  int k = curk[i - bi] + apos;
                  //rest not aligned to blocks of 4 rows
                  int bn = alen % 4;
                  for (; k < apos + bn && aix[k] < bkmin; k++)
                    vectMultiplyAdd(avals[k], b, c, aix[k] * n + bj, cix, bjlen);
                  //compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
                  for (; k < apos + alen && aix[k] < bkmin; k += 4)
                    vectMultiplyAdd4(avals[k], avals[k + 1], avals[k + 2], avals[k + 3],
                      b, c, aix[k] * n + bj, aix[k + 1] * n + bj, aix[k + 2] * n + bj, aix[k + 3] * n + bj, cix, bjlen);
                  //update positions on last bj block
                  if (bj + bjlen == n)
                    curk[i - bi] = k - apos;
                }
              }
            }
          }
        }
      }
    }
    else {
      SparseBlock a = m1.sparseBlock;
      for (int i = rl, cix = rl * n; i < ru; i++, cix += n) {
        if (!a.isEmpty(i)) {
          int apos = a.pos(i);
          int alen = a.size(i);
          int[] aix = a.indexes(i);
          double[] avals = a.values(i);
          for (int k = apos; k < apos + alen; k++) {
            double val = avals[k];
            for (int j = 0, bix = aix[k] * n; j < n; j++)
              c[cix + j] += val * b[bix + j];
          }
        }
      }
    }
  }
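  /* Example (illustrative, not part of the original file): the canonical row-iteration
   * pattern over SparseBlock used by the kernels above; pos/size select the row's slice
   * within the shared index/value arrays (consume is a hypothetical callback):
   *
   *   if (!a.isEmpty(i)) {
   *     int apos = a.pos(i), alen = a.size(i);
   *     int[] aix = a.indexes(i);
   *     double[] avals = a.values(i);
   *     for (int k = apos; k < apos + alen; k++)
   *       consume(aix[k], avals[k]); //column index and value of each non-zero
   *   }
   */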
  private static void matrixMultSparseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) throws DMLRuntimeException {
    SparseBlock a = m1.sparseBlock;
    SparseBlock b = m2.sparseBlock;
    double[] c = ret.denseBlock;
    int m = m1.rlen;
    int cd = m1.clen;
    int n = m2.clen;

    // MATRIX-MATRIX (VV, MV not applicable here because V always dense)
    if (LOW_LEVEL_OPTIMIZATION) {
      if (pm2 && m == 1) //VECTOR-MATRIX
      {
        //parallelization over rows in rhs matrix
        if (!a.isEmpty(0)) {
          int alen = a.size(0);
          int[] aix = a.indexes(0);
          double[] avals = a.values(0);
          int rlix = (rl == 0) ? 0 : a.posFIndexGTE(0, rl);
          rlix = (rlix >= 0) ? rlix : alen;

          for (int k = rlix; k < alen && aix[k] < ru; k++)
            if (!b.isEmpty(aix[k])) {
              int bpos = b.pos(aix[k]);
              int blen = b.size(aix[k]);
              int[] bix = b.indexes(aix[k]);
              double[] bvals = b.values(aix[k]);
              vectMultiplyAdd(avals[k], bvals, c, bix, bpos, 0, blen);
            }
        }
      }
      else //MATRIX-MATRIX
      {
        //block sizes for best-effort blocking w/ sufficient row reuse in B yet small overhead
        final int blocksizeI = 32;
        final int blocksizeK = (int) Math.max(32, UtilFunctions.nextIntPow2((int) Math.pow((double) m * cd / m1.nonZeros, 2)));

        //temporary array of current sparse positions
        int[] curk = new int[blocksizeI];

        //blocked execution over IK
        for (int bi = rl; bi < ru; bi += blocksizeI) {
          Arrays.fill(curk, 0); //reset positions
          for (int bk = 0, bimin = Math.min(ru, bi + blocksizeI); bk < cd; bk += blocksizeK) {
            final int bkmin = Math.min(cd, bk + blocksizeK);
            //core sub block matrix multiplication
            for (int i = bi, cix = bi * n; i < bimin; i++, cix += n) {
              if (!a.isEmpty(i)) {
                final int apos = a.pos(i);
                final int alen = a.size(i);
                int[] aix = a.indexes(i);
                double[] avals = a.values(i);
                int k = curk[i - bi] + apos;
                for (; k < apos + alen && aix[k] < bkmin; k++) {
                  if (!b.isEmpty(aix[k]))
                    vectMultiplyAdd(avals[k], b.values(aix[k]), c, b.indexes(aix[k]), b.pos(aix[k]), cix, b.size(aix[k]));
                }
                curk[i - bi] = k - apos;
              }
            }
          }
        }
      }
    }
    else {
      for (int i = rl, cix = rl * n; i < Math.min(ru, a.numRows()); i++, cix += n) {
        if (!a.isEmpty(i)) {
          int apos = a.pos(i);
          int alen = a.size(i);
          int[] aix = a.indexes(i);
          double[] avals = a.values(i);
          for (int k = apos; k < apos + alen; k++) {
            double val = avals[k];
            if (!b.isEmpty(aix[k])) {
              int bpos = b.pos(aix[k]);
              int blen = b.size(aix[k]);
              int[] bix = b.indexes(aix[k]);
              double[] bvals = b.values(aix[k]);
              for (int j = bpos; j < bpos + blen; j++)
                c[cix + bix[j]] += val * bvals[j];
            }
          }
        }
      }
    }
  }
  /**
   * This implementation applies to any combination of dense/sparse if at least one
   * input is ultra-sparse (sparse and very few nnz). In that case, most importantly,
   * we want to create a sparse output and only iterate over the few nnz as the major
   * dimension. Low-level optimizations are of less importance in that case, and having
   * this generic implementation helps to reduce the implementations from (2+1)^2
   * to 2^2+1.
   *
   * @param m1 first matrix
   * @param m2 second matrix
   * @param ret result matrix
   * @param rl row lower bound
   * @param ru row upper bound
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  private static void matrixMultUltraSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) throws DMLRuntimeException {
    //TODO perf sparse block, consider iterators
    boolean leftUS = m1.isUltraSparse();
    final int m = m1.rlen;
    final int cd = m1.clen;
    final int n = m2.clen;

    if (leftUS) //left is ultra-sparse (IKJ)
    {
      SparseBlock a = m1.sparseBlock;
      boolean rightSparse = m2.sparse;

      for (int i = rl; i < ru; i++) {
        if (!a.isEmpty(i)) {
          int apos = a.pos(i);
          int alen = a.size(i);
          int[] aixs = a.indexes(i);
          double[] avals = a.values(i);

          if (alen == 1 && avals[apos] == 1) //ROW SELECTION (no aggregation)
          {
            int aix = aixs[apos];
            if (rightSparse) { //sparse right matrix (full row copy)
              if (!m2.sparseBlock.isEmpty(aix)) {
                ret.rlen = m;
                ret.allocateSparseRowsBlock(false); //allocation on demand
                ret.sparseBlock.set(i, m2.sparseBlock.get(aix), true);
                ret.nonZeros += ret.sparseBlock.size(i);
              }
            }
            else { //dense right matrix (append all values)
              for (int j = 0; j < n; j++)
                ret.appendValue(i, j, m2.quickGetValue(aix, j));
            }
          }
          else //GENERAL CASE
          {
            for (int k = apos; k < apos + alen; k++) {
              double aval = avals[k];
              int aix = aixs[k];
              for (int j = 0; j < n; j++) {
                double cval = ret.quickGetValue(i, j);
                double cvald = aval * m2.quickGetValue(aix, j);
                if (cvald != 0)
                  ret.quickSetValue(i, j, cval + cvald);
              }
            }
          }
        }
      }
    }
    else //right is ultra-sparse (KJI)
    {
      SparseBlock b = m2.sparseBlock;

      for (int k = 0; k < cd; k++) {
        if (!b.isEmpty(k)) {
          int bpos = b.pos(k);
          int blen = b.size(k);
          int[] bixs = b.indexes(k);
          double[] bvals = b.values(k);
          for (int j = bpos; j < bpos + blen; j++) {
            double bval = bvals[j];
            int bix = bixs[j];
            for (int i = rl; i < ru; i++) {
              double cvald = bval * m1.quickGetValue(i, k);
              if (cvald != 0) {
                double cval = ret.quickGetValue(i, bix);
                ret.quickSetValue(i, bix, cval + cvald);
              }
            }
          }
        }
      }
    }
    //no need to recompute nonzeros because maintained internally
  }
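  /* Example (illustrative, not part of the original file): the ROW SELECTION special case
   * above treats an ultra-sparse left row with a single entry of value 1 as a plain row
   * copy, i.e., a permutation/selection without any multiply-add work:
   *
   *   //if row i of m1 equals the unit vector e_aix, then ret[i,] = m2[aix,]
   *   ret.sparseBlock.set(i, m2.sparseBlock.get(aix), true); //sparse rhs: full row copy
   */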
  private static void matrixMultChainDense(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int rl, int ru) {
    double[] a = mX.denseBlock;
    double[] b = mV.denseBlock;
    double[] w = (mW != null) ? mW.denseBlock : null;
    double[] c = ret.denseBlock;
    final int cd = mX.clen; //features in X
    boolean weights = (ct == ChainType.XtwXv);
    boolean weights2 = (ct == ChainType.XtXvy);

    //temporary array for cache blocking
    //(blocksize chosen to fit b+v in L2 (256KB) for default 1k blocks)
    final int blocksize = 24; //constraint: factor of 4
    double[] tmp = new double[blocksize];

    //blockwise mmchain computation
    final int bn = ru - ru % blocksize; //rl blocksize aligned
    for (int bi = rl; bi < bn; bi += blocksize) {
      //compute 1st matrix-vector for row block
      for (int j = 0, aix = bi * cd; j < blocksize; j++, aix += cd)
        tmp[j] = dotProduct(a, b, aix, 0, cd);

      //multiply/subtract weights (in-place), if required
      if (weights)
        vectMultiply(w, tmp, bi, 0, blocksize);
      else if (weights2)
        vectSubtract(w, tmp, bi, 0, blocksize);

      //compute 2nd matrix vector for row block and aggregate
      for (int j = 0, aix = bi * cd; j < blocksize; j += 4, aix += 4 * cd)
        vectMultiplyAdd4(tmp[j], tmp[j + 1], tmp[j + 2], tmp[j + 3],
          a, c, aix, aix + cd, aix + 2 * cd, aix + 3 * cd, 0, cd);
    }

    //compute rest (not aligned to blocksize)
    for (int i = bn, aix = bn * cd; i < ru; i++, aix += cd) {
      double val = dotProduct(a, b, aix, 0, cd);
      val *= (weights) ? w[i] : 1;
      val -= (weights2) ? w[i] : 0;
      vectMultiplyAdd(val, a, c, aix, 0, cd);
    }
  }

  private static void matrixMultChainSparse(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int rl, int ru) {
    SparseBlock a = mX.sparseBlock;
    double[] b = mV.denseBlock;
    double[] w = (mW != null) ? mW.denseBlock : null;
    double[] c = ret.denseBlock;
    boolean weights = (ct == ChainType.XtwXv);
    boolean weights2 = (ct == ChainType.XtXvy);

    //temporary array for cache blocking
    //(blocksize chosen to fit b+v in L2 (256KB) for default 1k blocks)
    final int blocksize = 24;
    double[] tmp = new double[blocksize];

    //blockwise mmchain computation
    for (int bi = rl; bi < ru; bi += blocksize) {
      //reset row block intermediate
      int tmplen = Math.min(blocksize, ru - bi);

      //compute 1st matrix-vector for row block
      for (int j = 0; j < tmplen; j++) {
        if (!a.isEmpty(bi + j)) {
          int apos = a.pos(bi + j);
          int alen = a.size(bi + j);
          tmp[j] = dotProduct(a.values(bi + j), b, a.indexes(bi + j), apos, 0, alen);
        }
      }

      //multiply weights (in-place), if required
      if (weights)
        vectMultiply(w, tmp, bi, 0, tmplen);
      else if (weights2)
        vectSubtract(w, tmp, bi, 0, tmplen);

      //compute 2nd matrix vector for row block and aggregate
      for (int j = 0; j < tmplen; j++) {
        if (!a.isEmpty(bi + j) && tmp[j] != 0) {
          int apos = a.pos(bi + j);
          int alen = a.size(bi + j);
          int[] aix = a.indexes(bi + j);
          double[] avals = a.values(bi + j);
          vectMultiplyAdd(tmp[j], avals, c, aix, apos, 0, alen);
        }
      }
    }
  }

  private static void matrixMultTransposeSelfDense(MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int rl, int ru) throws DMLRuntimeException {
    //2) transpose self matrix multiply dense
    // (compute only upper-triangular matrix due to symmetry)
    double[] a = m1.denseBlock;
    double[] c = ret.denseBlock;
    int m = m1.rlen;
    int n = m1.clen;

    if (leftTranspose) // t(X)%*%X
    {
      if (LOW_LEVEL_OPTIMIZATION) {
        if (n == 1) //VECTOR (col)
        {
          c[0] = dotProduct(a, a, m);
        }
        else //MATRIX
        {
          //1) Unrolled inner loop (for better instruction-level parallelism)
          //2) Blocked execution (for less cache thrashing in parallel exec)
          //3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
          final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
          final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
          final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan

          //temporary arrays (nnz a, b index)
          double[] ta = new double[blocksizeK];
          int[] tbi = new int[blocksizeK];
          final int mx = ru;
          final int cdx = m;
          final int nx = n;

          //blocked execution
          for (int bi = rl; bi < mx; bi += blocksizeI) //from bi due to symmetry
            for (int bk = 0, bimin = Math.min(mx, bi + blocksizeI); bk < cdx; bk += blocksizeK)
              for (int bj = bi, bkmin = Math.min(cdx, bk + blocksizeK); bj < nx; bj += blocksizeJ) {
                int bklen = bkmin - bk;
                int bjlen = Math.min(nx, bj + blocksizeJ) - bj;
                //core sub block matrix multiplication
                for (int i = bi; i < bimin; i++) {
                  int aixi = bk * n + i; //start index on a (logical t(X))
                  int cixj = i * nx + bj; //scan index on c
                  //determine nnz of a (for sparsity-aware skipping of rows)
                  int knnz = copyNonZeroElements(a, aixi, bk, bj, n, nx, ta, tbi, bklen);
                  //rest not aligned to blocks of 4 rows
                  final int bn = knnz % 4;
                  switch (bn) {
                    case 1: vectMultiplyAdd(ta[0], a, c, tbi[0], cixj, bjlen); break;
                    case 2: vectMultiplyAdd2(ta[0], ta[1], a, c, tbi[0], tbi[1], cixj, bjlen); break;
                    case 3: vectMultiplyAdd3(ta[0], ta[1], ta[2], a, c, tbi[0], tbi[1], tbi[2], cixj, bjlen); break;
                  }
                  //compute blocks of 4 rows (core inner loop)
                  for (int k = bn; k < knnz; k += 4) {
                    vectMultiplyAdd4(ta[k], ta[k + 1], ta[k + 2], ta[k + 3],
                      a, c, tbi[k], tbi[k + 1], tbi[k + 2], tbi[k + 3], cixj, bjlen);
                  }
                }
              }
        }
      }
      else {
        for (int k = 0, ix1 = 0; k < m; k++, ix1 += n)
          for (int i = rl, ix3 = 0; i < ru; i++, ix3 += n) {
            double val = a[ix1 + i];
            if (val != 0) {
              for (int j = i; j < n; j++) //from i due to symmetry
                c[ix3 + j] += val * a[ix1 + j];
            }
          }
      }
    }
    else // X%*%t(X)
    {
      if (LOW_LEVEL_OPTIMIZATION) {
        if (m == 1) //VECTOR
        {
          c[0] = dotProduct(a, a, n);
        }
        else //MATRIX
        {
          //algorithm: scan c, foreach ci,j: scan row of a and t(a) (IJK)
          //1) Unrolled inner loop, for better ILP
          //2) Blocked execution, for less cache thrashing in parallel exec
          //   (smaller block sizes would be slightly better, but consistent as is)
          //3) Single write in inner loop (transient intermediates)
          int blocksize = 64;
          for (int bi = rl; bi < ru; bi += blocksize)
            for (int bj = bi; bj < m; bj += blocksize) {
              final int bimin = Math.min(ru, bi + blocksize);
              final int bjmin = Math.min(m, bj + blocksize);
              for (int i = bi, ix1 = bi * n, ix3 = bi * m; i < bimin; i++, ix1 += n, ix3 += m) {
                final int bjmax = Math.max(i, bj);
                for (int j = bjmax, ix2 = bjmax * n; j < bjmin; j++, ix2 += n) //from i due to symmetry
                {
                  c[ix3 + j] = dotProduct(a, a, ix1, ix2, n);
                }
              }
            }
        }
      }
      else {
        for (int i = rl, ix1 = 0, ix3 = 0; i < ru; i++, ix1 += n, ix3 += m)
          for (int j = i, ix2 = i * n; j < m; j++, ix2 += n) //from i due to symmetry
          {
            double val = 0;
            for (int k = 0; k < n; k++)
              val += a[ix1 + k] * a[ix2 + k];
            c[ix3 + j] = val;
          }
      }
    }
  }
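  /* Example (illustrative, not part of the original file): both tsmm kernels only write
   * cells c[i*n+j] with j>=i; the final mirroring done by copyUpperToLowerTriangle in the
   * public entry points is equivalent to:
   *
   *   for (int i = 0; i < n; i++)
   *     for (int j = i + 1; j < n; j++)
   *       c[j * n + i] = c[i * n + j]; //copy upper triangle to lower triangle
   */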
rlix : apos + alen; for (int i = rlix; i < apos + alen && aix[i] < ru; i++) { double val = avals[i]; if (val != 0) for (int j = i, ix2 = aix[i] * n; j < apos + alen; j++) c[ix2 + aix[j]] += val * avals[j]; } } } } else // X%*%t(X) { if (m == 1) //VECTOR { if (!m1.sparseBlock.isEmpty(0)) { int alen = m1.sparseBlock.size(0); //pos always 0 double[] avals = a.values(0); c[0] = dotProduct(avals, avals, alen); } } else //MATRIX { //note: reorg to similar layout as t(X)%*%X because faster than //direct computation with IJK (no dependencies/branches in inner loop) //see preprocessMatrixMultTransposeSelf m1<-tmpBlock m = m1.clen; n = m1.rlen; //algorithm: scan rows, foreach row self join (KIJ) if (LOW_LEVEL_OPTIMIZATION) { int arlen = a.numRows(); for (int r = 0; r < arlen; r++) if (!a.isEmpty(r)) { int apos = a.pos(r); int alen = a.size(r); int[] aix = a.indexes(r); double[] avals = a.values(r); int rlix = (rl == 0) ? apos : a.posFIndexGTE(r, rl); rlix = (rlix >= 0) ? rlix : apos + alen; for (int i = rlix; i < apos + alen && aix[i] < ru; i++) { double val = avals[i]; if (val != 0) { int ix2 = aix[i] * m; vectMultiplyAdd(val, avals, c, aix, i, ix2, alen - i); } } } } else { for (int r = 0; r < a.numRows(); r++) if (!a.isEmpty(r)) { int apos = a.pos(r); int alen = a.size(r); int[] aix = a.indexes(r); double[] avals = a.values(r); int rlix = (rl == 0) ? apos : a.posFIndexGTE(r, rl); rlix = (rlix >= 0) ? rlix : apos + alen; for (int i = rlix; i < apos + alen && aix[i] < ru; i++) { double val = avals[i]; if (val != 0) for (int j = i, ix2 = aix[i] * m; j < alen; j++) c[ix2 + aix[j]] += val * avals[j]; } } } } } } private static void matrixMultPermuteDense(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) throws DMLRuntimeException { double[] a = pm1.denseBlock; double[] b = m2.denseBlock; double[] c = ret1.denseBlock; final int n = m2.clen; final int brlen = ret1.getNumRows(); int lastblk = -1; for (int i = rl, bix = rl * n; i < ru; i++, bix += n) { //compute block index and in-block indexes int pos = UtilFunctions.toInt(a[i]); //safe cast if (pos > 0) //selected row { int bpos = (pos - 1) % brlen; int blk = (pos - 1) / brlen; //allocate and switch to second output block //(never happens in cp, correct for multi-threaded usage) if (lastblk != -1 && lastblk < blk) { ret2.sparse = false; ret2.allocateDenseBlock(); c = ret2.denseBlock; } //memcopy entire dense row into target position System.arraycopy(b, bix, c, bpos * n, n); lastblk = blk; } } } private static void matrixMultPermuteDenseSparse(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) { double[] a = pm1.denseBlock; double[] b = m2.denseBlock; SparseBlock c = ret1.sparseBlock; final int n = m2.clen; final int brlen = ret1.getNumRows(); int lastblk = -1; for (int i = rl, bix = rl * n; i < ru; i++, bix += n) { //compute block index and in-block indexes int pos = UtilFunctions.toInt(a[i]); //safe cast if (pos > 0) //selected row { int bpos = (pos - 1) % brlen; int blk = (pos - 1) / brlen; //allocate and switch to second output block //(never happens in cp, correct for multi-threaded usage) if (lastblk != -1 && lastblk < blk) { ret2.sparse = true; ret2.rlen = ret1.rlen; ret2.allocateSparseRowsBlock(); c = ret2.sparseBlock; } //append entire dense row into sparse target position for (int j = 0; j < n; j++) c.append(bpos, j, b[bix + j]); lastblk = blk; } } } private static void matrixMultPermuteSparse(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) 
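/* Aside (illustrative sketch, not part of the original class): in the matrixMultPermute*
 * variants, pm1 is a dense column vector of 1-based target row positions rather than a
 * materialized 0/1 permutation matrix. Ignoring the split across two output blocks
 * (the bpos/blk arithmetic), the core operation per selected row is simply:
 *
 *   //scatter rows of m2 according to a position vector pos (0 means "not selected")
 *   static double[][] permuteRows(double[] pos, double[][] m2, int nrows) {
 *     double[][] out = new double[nrows][m2[0].length];
 *     for (int i = 0; i < pos.length; i++)
 *       if (pos[i] > 0) //selected row
 *         out[(int) pos[i] - 1] = m2[i].clone();
 *     return out;
 *   }
 */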
{ double[] a = pm1.denseBlock; SparseBlock b = m2.sparseBlock; SparseBlock c = ret1.sparseBlock; final int brlen = ret1.getNumRows(); int lastblk = -1; for (int i = rl; i < ru; i++) { //compute block index and in-block indexes int pos = UtilFunctions.toInt(a[i]); //safe cast if (pos > 0) //selected row { int bpos = (pos - 1) % brlen; int blk = (pos - 1) / brlen; //allocate and switch to second output block //(never happens in cp, correct for multi-threaded usage) if (lastblk != -1 && lastblk < blk) { ret2.sparse = true; ret2.allocateSparseRowsBlock(); c = ret2.sparseBlock; } //memcopy entire sparse row into target position c.set(bpos, b.get(i), true); lastblk = blk; } } } private static void matrixMultWSLossDense(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int rl, int ru) { double[] x = mX.denseBlock; double[] u = mU.denseBlock; double[] v = mV.denseBlock; double[] w = (mW != null) ? mW.denseBlock : null; final int n = mX.clen; final int cd = mU.clen; double wsloss = 0; // approach: iterate over all cells of X //cache-conscious blocking: due to blocksize constraint (default 1000), //a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) final int blocksizeIJ = 16; //u/v block (max at typical L2 size) //blocked execution for (int bi = rl; bi < ru; bi += blocksizeIJ) for (int bj = 0, bimin = Math.min(ru, bi + blocksizeIJ); bj < n; bj += blocksizeIJ) { int bjmin = Math.min(n, bj + blocksizeIJ); // Pattern 1) sum (W * (X - U %*% t(V)) ^ 2) (post weighting) if (wt == WeightsType.POST) { for (int i = bi, ix = bi * n, uix = bi * cd; i < bimin; i++, ix += n, uix += cd) for (int j = bj, vix = bj * cd; j < bjmin; j++, vix += cd) { double wij = w[ix + j]; if (wij != 0) { double uvij = dotProduct(u, v, uix, vix, cd); wsloss += wij * (x[ix + j] - uvij) * (x[ix + j] - uvij); //^2 } } } // Pattern 1b) sum ((X!=0) * (X - U %*% t(V)) ^ 2) (post_nz weighting) else if (wt == WeightsType.POST_NZ) { for (int i = bi, ix = bi * n, uix = bi * cd; i < bimin; i++, ix += n, uix += cd) for (int j = bj, vix = bj * cd; j < bjmin; j++, vix += cd) { double xij = x[ix + j]; if (xij != 0) { double uvij = dotProduct(u, v, uix, vix, cd); wsloss += (xij - uvij) * (xij - uvij); //^2 } } } // Pattern 2) sum ((X - W * (U %*% t(V))) ^ 2) (pre weighting) else if (wt == WeightsType.PRE) { for (int i = bi, ix = bi * n, uix = bi * cd; i < bimin; i++, ix += n, uix += cd) for (int j = bj, vix = bj * cd; j < bjmin; j++, vix += cd) { double wij = w[ix + j]; double uvij = 0; if (wij != 0) uvij = dotProduct(u, v, uix, vix, cd); wsloss += (x[ix + j] - wij * uvij) * (x[ix + j] - wij * uvij); //^2 } } // Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting) else if (wt == WeightsType.NONE) { for (int i = bi, ix = bi * n, uix = bi * cd; i < bimin; i++, ix += n, uix += cd) for (int j = bj, vix = bj * cd; j < bjmin; j++, vix += cd) { double uvij = dotProduct(u, v, uix, vix, cd); wsloss += (x[ix + j] - uvij) * (x[ix + j] - uvij); //^2 } } } ret.quickSetValue(0, 0, wsloss); } private static void matrixMultWSLossSparseDense(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int rl, int ru) { SparseBlock x = mX.sparseBlock; SparseBlock w = (mW != null) ? 
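/* Aside (illustrative sketch, not part of the original class): all wsloss patterns reduce
 * X, U, V, and W to a single scalar. For the POST pattern above, an unblocked reference
 * computation over small dense arrays makes the semantics explicit:
 *
 *   //sum(W * (X - U %*% t(V))^2), with U of size m x k and V of size n x k
 *   static double wslossPost(double[][] X, double[][] U, double[][] V, double[][] W) {
 *     double wsloss = 0;
 *     for (int i = 0; i < X.length; i++)
 *       for (int j = 0; j < X[i].length; j++) {
 *         double uvij = 0;
 *         for (int k = 0; k < U[i].length; k++)
 *           uvij += U[i][k] * V[j][k]; //(U %*% t(V))[i,j]
 *         double dij = X[i][j] - uvij;
 *         wsloss += W[i][j] * dij * dij;
 *       }
 *     return wsloss;
 *   }
 *
 * The blocksize of 16 used above fits the L2 budget because two 16-row panels of U and V
 * with at most 1,000 columns occupy roughly 2 * 16 * 1000 * 8B = 256,000B, i.e. about 256KB.
 */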
mW.sparseBlock : null; double[] u = mU.denseBlock; double[] v = mV.denseBlock; final int n = mX.clen; final int cd = mU.clen; double wsloss = 0; // Pattern 1) sum (W * (X - U %*% t(V)) ^ 2) (post weighting) if (wt == WeightsType.POST) { // approach: iterate over W, point-wise in order to exploit sparsity for (int i = rl, uix = rl * cd; i < ru; i++, uix += cd) if (!w.isEmpty(i)) { int wpos = w.pos(i); int wlen = w.size(i); int[] wix = w.indexes(i); double[] wval = w.values(i); if (w.isAligned(i, x)) { //O(n) where n is nnz in w/x double[] xval = x.values(i); for (int k = wpos; k < wpos + wlen; k++) { double uvij = dotProduct(u, v, uix, wix[k] * cd, cd); wsloss += wval[k] * (xval[k] - uvij) * (xval[k] - uvij); } } else { //O(n log m) where n/m is nnz in w/x for (int k = wpos; k < wpos + wlen; k++) { double xi = mX.quickGetValue(i, wix[k]); double uvij = dotProduct(u, v, uix, wix[k] * cd, cd); wsloss += wval[k] * (xi - uvij) * (xi - uvij); } } } } // Pattern 1b) sum ((X!=0) * (X - U %*% t(V)) ^ 2) (post_nz weighting) else if (wt == WeightsType.POST_NZ) { // approach: iterate over nonzeros of X (the implicit weights), point-wise in order to exploit sparsity // blocked over ij, while maintaining front of column indexes, where the // blocksize is chosen such that we reuse each vector on average 8 times. final int blocksizeIJ = (int) (8L * mX.rlen * mX.clen / mX.nonZeros); int[] curk = new int[blocksizeIJ]; for (int bi = rl; bi < ru; bi += blocksizeIJ) { int bimin = Math.min(ru, bi + blocksizeIJ); //prepare starting indexes for block row Arrays.fill(curk, 0); //blocked execution over column blocks for (int bj = 0; bj < n; bj += blocksizeIJ) { int bjmin = Math.min(n, bj + blocksizeIJ); for (int i = bi, uix = bi * cd; i < bimin; i++, uix += cd) { if (!x.isEmpty(i)) { int xpos = x.pos(i); int xlen = x.size(i); int[] xix = x.indexes(i); double[] xval = x.values(i); int k = xpos + curk[i - bi]; for (; k < xpos + xlen && xix[k] < bjmin; k++) { double uvij = dotProduct(u, v, uix, xix[k] * cd, cd); wsloss += (xval[k] - uvij) * (xval[k] - uvij); } curk[i - bi] = k - xpos; } } } } } // Pattern 2) sum ((X - W * (U %*% t(V))) ^ 2) (pre weighting) else if (wt == WeightsType.PRE) { // approach: iterate over all cells of X, either sparse or dense // (note: tuning similar to pattern 3 possible but more complex) for (int i = rl, uix = rl * cd; i < ru; i++, uix += cd) for (int j = 0, vix = 0; j < n; j++, vix += cd) { double xij = mX.quickGetValue(i, j); double wij = mW.quickGetValue(i, j); double uvij = 0; if (wij != 0) uvij = dotProduct(u, v, uix, vix, cd); wsloss += (xij - wij * uvij) * (xij - wij * uvij); } } // Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting) else if (wt == WeightsType.NONE) { // approach: iterate over all cells of X for (int i = rl, uix = rl * cd; i < ru; i++, uix += cd) { if (x.isEmpty(i)) { //empty row for (int j = 0, vix = 0; j < n; j++, vix += cd) { double uvij = dotProduct(u, v, uix, vix, cd); wsloss += (-uvij) * (-uvij); } } else { //non-empty row int xpos = x.pos(i); int xlen = x.size(i); int[] xix = x.indexes(i); double[] xval = x.values(i); int last = -1; for (int k = xpos; k < xpos + xlen; k++) { //process last nnz until current nnz for (int k2 = last + 1; k2 < xix[k]; k2++) { double uvij = dotProduct(u, v, uix, k2 * cd, cd); wsloss += (-uvij) * (-uvij); } //process current nnz double uvij = dotProduct(u, v, uix, xix[k] * cd, cd); wsloss += (xval[k] - uvij) * (xval[k] - uvij); last = xix[k]; } //process last nnz until end of row for (int k2 = last + 1; k2 < n; k2++) { double uvij = dotProduct(u, v, uix, k2 * cd,
cd); wsloss += (-uvij) * (-uvij); } } } } ret.quickSetValue(0, 0, wsloss); } private static void matrixMultWSLossGeneric(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int rl, int ru) { final int n = mX.clen; final int cd = mU.clen; double wsloss = 0; // Pattern 1) sum (W * (X - U %*% t(V)) ^ 2) (post weighting) if (wt == WeightsType.POST) { // approach: iterate over W, point-wise in order to exploit sparsity if (mW.sparse) //SPARSE { SparseBlock w = mW.sparseBlock; for (int i = rl; i < ru; i++) if (!w.isEmpty(i)) { int wpos = w.pos(i); int wlen = w.size(i); int[] wix = w.indexes(i); double[] wval = w.values(i); for (int k = wpos; k < wpos + wlen; k++) { double uvij = dotProductGeneric(mU, mV, i, wix[k], cd); double xi = mX.quickGetValue(i, wix[k]); wsloss += wval[k] * (xi - uvij) * (xi - uvij); } } } else //DENSE { double[] w = mW.denseBlock; for (int i = rl, wix = rl * n; i < ru; i++, wix += n) for (int j = 0; j < n; j++) if (w[wix + j] != 0) { double uvij = dotProductGeneric(mU, mV, i, j, cd); double xij = mX.quickGetValue(i, j); wsloss += w[wix + j] * (xij - uvij) * (xij - uvij); } } } // Pattern 1b) sum ((X!=0) * (X - U %*% t(V)) ^ 2) (post_nz weighting) else if (wt == WeightsType.POST_NZ) { // approach: iterate over nonzeros of X (the implicit weights), point-wise in order to exploit sparsity if (mX.sparse) //SPARSE { SparseBlock x = mX.sparseBlock; for (int i = rl; i < ru; i++) if (!x.isEmpty(i)) { int xpos = x.pos(i); int xlen = x.size(i); int[] xix = x.indexes(i); double[] xval = x.values(i); for (int k = xpos; k < xpos + xlen; k++) { double uvij = dotProductGeneric(mU, mV, i, xix[k], cd); wsloss += (xval[k] - uvij) * (xval[k] - uvij); } } } else //DENSE { double[] x = mX.denseBlock; for (int i = rl, xix = rl * n; i < ru; i++, xix += n) for (int j = 0; j < n; j++) if (x[xix + j] != 0) { double uvij = dotProductGeneric(mU, mV, i, j, cd); wsloss += (x[xix + j] - uvij) * (x[xix + j] - uvij); } } } // Pattern 2) sum ((X - W * (U %*% t(V))) ^ 2) (pre weighting) else if (wt == WeightsType.PRE) { // approach: iterate over all cells of X, either sparse or dense for (int i = rl; i < ru; i++) for (int j = 0; j < n; j++) { double xij = mX.quickGetValue(i, j); double wij = mW.quickGetValue(i, j); double uvij = 0; if (wij != 0) uvij = dotProductGeneric(mU, mV, i, j, cd); wsloss += (xij - wij * uvij) * (xij - wij * uvij); } } // Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting) else if (wt == WeightsType.NONE) { // approach: iterate over all cells of X for (int i = rl; i < ru; i++) for (int j = 0; j < n; j++) { double xij = mX.quickGetValue(i, j); double uvij = dotProductGeneric(mU, mV, i, j, cd); wsloss += (xij - uvij) * (xij - uvij); } } ret.quickSetValue(0, 0, wsloss); } private static void matrixMultWSigmoidDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru) throws DMLRuntimeException { double[] w = mW.denseBlock; double[] c = ret.denseBlock; double[] u = mU.denseBlock; double[] v = mV.denseBlock; final int n = mW.clen; final int cd = mU.clen; //note: cannot compute U %*% t(V) in-place of result w/ regular mm because //t(V) comes in transformed form and hence would require additional memory boolean flagminus = (wt == WSigmoidType.MINUS || wt == WSigmoidType.LOG_MINUS); boolean flaglog = (wt == WSigmoidType.LOG || wt == WSigmoidType.LOG_MINUS); //approach: iterate over non-zeros of w, selective mm computation //cache-conscious blocking: due to blocksize constraint (default 1000), //a blocksize of 16 allows to fit blocks
of UV into L2 cache (256KB) final int blocksizeIJ = 16; //u/v block (max at typical L2 size) //blocked execution for (int bi = rl; bi < ru; bi += blocksizeIJ) for (int bj = 0, bimin = Math.min(ru, bi + blocksizeIJ); bj < n; bj += blocksizeIJ) { int bjmin = Math.min(n, bj + blocksizeIJ); //core wsigmoid computation for (int i = bi, ix = bi * n, uix = bi * cd; i < bimin; i++, ix += n, uix += cd) for (int j = bj, vix = bj * cd; j < bjmin; j++, vix += cd) { double wij = w[ix + j]; if (wij != 0) c[ix + j] = wsigmoid(wij, u, v, uix, vix, flagminus, flaglog, cd); } } } private static void matrixMultWSigmoidSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru) throws DMLRuntimeException { SparseBlock w = mW.sparseBlock; SparseBlock c = ret.sparseBlock; double[] u = mU.denseBlock; double[] v = mV.denseBlock; final int cd = mU.clen; boolean flagminus = (wt == WSigmoidType.MINUS || wt == WSigmoidType.LOG_MINUS); boolean flaglog = (wt == WSigmoidType.LOG || wt == WSigmoidType.LOG_MINUS); //approach: iterate over non-zeros of w, selective mm computation for (int i = rl, uix = rl * cd; i < ru; i++, uix += cd) if (!w.isEmpty(i)) { int wpos = w.pos(i); int wlen = w.size(i); int[] wix = w.indexes(i); double[] wval = w.values(i); c.allocate(i, wlen); for (int k = wpos; k < wpos + wlen; k++) { double cval = wsigmoid(wval[k], u, v, uix, wix[k] * cd, flagminus, flaglog, cd); c.append(i, wix[k], cval); } } } private static void matrixMultWSigmoidGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru) throws DMLRuntimeException { final int n = mW.clen; final int cd = mU.clen; boolean flagminus = (wt == WSigmoidType.MINUS || wt == WSigmoidType.LOG_MINUS); boolean flaglog = (wt == WSigmoidType.LOG || wt == WSigmoidType.LOG_MINUS); //approach: iterate over non-zeros of w, selective mm computation if (mW.sparse) //SPARSE { //w and c always in same representation SparseBlock w = mW.sparseBlock; SparseBlock c = ret.sparseBlock; for (int i = rl; i < ru; i++) if (!w.isEmpty(i)) { int wpos = w.pos(i); int wlen = w.size(i); int[] wix = w.indexes(i); double[] wval = w.values(i); c.allocate(i, wlen); for (int k = wpos; k < wpos + wlen; k++) { double cval = wsigmoid(wval[k], mU, mV, i, wix[k], flagminus, flaglog, cd); c.append(i, wix[k], cval); } } } else //DENSE { //w and c always in same representation double[] w = mW.denseBlock; double[] c = ret.denseBlock; for (int i = rl, ix = rl * n; i < ru; i++) for (int j = 0; j < n; j++, ix++) { double wij = w[ix]; if (wij != 0) { c[ix] = wsigmoid(wij, mU, mV, i, j, flagminus, flaglog, cd); } } } } private static void matrixMultWDivMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) throws DMLRuntimeException { final boolean basic = wt.isBasic(); final boolean left = wt.isLeft(); final boolean mult = wt.isMult(); final boolean minus = wt.isMinus(); final boolean four = wt.hasFourInputs(); final boolean scalar = wt.hasScalar(); final double eps = scalar ? mX.quickGetValue(0, 0) : 0; final int n = mW.clen; final int cd = mU.clen; double[] w = mW.denseBlock; double[] u = mU.denseBlock; double[] v = mV.denseBlock; double[] x = (mX == null) ? 
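/* Aside (illustrative, not part of the original class): each wsigmoid cell value follows
 * directly from the scalar helper defined further below. For a nonzero weight wij and the
 * dot product uvij of the corresponding rows of U and V (the class uses FastMath; plain
 * java.lang.Math is shown here for a self-contained sketch):
 *
 *   double cval = flagminus ? 1 / (1 + Math.exp(uvij))   //sigmoid(-(U %*% t(V))[i,j])
 *                           : 1 / (1 + Math.exp(-uvij)); //sigmoid((U %*% t(V))[i,j])
 *   double out  = wij * (flaglog ? Math.log(cval) : cval);
 *
 * so LOG_MINUS, for example, computes W * log(sigmoid(-(U %*% t(V)))) cell-wise.
 */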
null : mX.denseBlock; double[] c = ret.denseBlock; //approach: iterate over non-zeros of w, selective mm computation //cache-conscious blocking: due to blocksize constraint (default 1000), //a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) final int blocksizeIJ = 16; //u/v block (max at typical L2 size) //blocked execution for (int bi = rl; bi < ru; bi += blocksizeIJ) for (int bj = cl, bimin = Math.min(ru, bi + blocksizeIJ); bj < cu; bj += blocksizeIJ) { int bjmin = Math.min(cu, bj + blocksizeIJ); //core wdivmm computation for (int i = bi, ix = bi * n, uix = bi * cd; i < bimin; i++, ix += n, uix += cd) for (int j = bj, vix = bj * cd; j < bjmin; j++, vix += cd) if (w[ix + j] != 0) { if (basic) c[ix + j] = w[ix + j] * dotProduct(u, v, uix, vix, cd); else if (four) //left/right if (scalar) wdivmm(w[ix + j], eps, u, v, c, uix, vix, left, scalar, cd); else wdivmm(w[ix + j], x[ix + j], u, v, c, uix, vix, left, scalar, cd); else //left/right minus/default wdivmm(w[ix + j], u, v, c, uix, vix, left, mult, minus, cd); } } private static void matrixMultWDivMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) throws DMLRuntimeException { final boolean basic = wt.isBasic(); final boolean left = wt.isLeft(); final boolean mult = wt.isMult(); final boolean minus = wt.isMinus(); final boolean four = wt.hasFourInputs(); final boolean scalar = wt.hasScalar(); final double eps = scalar ? mX.quickGetValue(0, 0) : 0; final int cd = mU.clen; SparseBlock w = mW.sparseBlock; double[] u = mU.denseBlock; double[] v = mV.denseBlock; double[] c = ret.denseBlock; SparseBlock x = (mX == null) ? null : mX.sparseBlock; //approach: iterate over non-zeros of w, selective mm computation //blocked over ij, while maintaining front of column indexes, where the //blocksize is chosen such that we reuse each Ui/Vj vector on average 8 times, //with custom blocksizeJ for wdivmm_left to avoid LLC misses on output. final int blocksizeI = (int) (8L * mW.rlen * mW.clen / mW.nonZeros); final int blocksizeJ = left ? Math.max(8, Math.min(L2_CACHESIZE / (mU.clen * 8), blocksizeI)) : blocksizeI; int[] curk = new int[blocksizeI]; boolean[] aligned = (four && !scalar) ? new boolean[blocksizeI] : null; //blocked execution over row blocks for (int bi = rl; bi < ru; bi += blocksizeI) { int bimin = Math.min(ru, bi + blocksizeI); //prepare starting indexes for block row for (int i = bi; i < bimin; i++) { int k = (cl == 0 || w.isEmpty(i)) ? 0 : w.posFIndexGTE(i, cl); curk[i - bi] = (k >= 0) ?
k : mW.clen; } //prepare alignment info if necessary if (four && !scalar) for (int i = bi; i < bimin; i++) aligned[i - bi] = w.isAligned(i - bi, x); //blocked execution over column blocks for (int bj = cl; bj < cu; bj += blocksizeJ) { int bjmin = Math.min(cu, bj + blocksizeJ); //core wdivmm block matrix mult for (int i = bi, uix = bi * cd; i < bimin; i++, uix += cd) { if (w.isEmpty(i)) continue; int wpos = w.pos(i); int wlen = w.size(i); int[] wix = w.indexes(i); double[] wval = w.values(i); int k = wpos + curk[i - bi]; if (basic) { for (; k < wpos + wlen && wix[k] < bjmin; k++) ret.appendValue(i, wix[k], wval[k] * dotProduct(u, v, uix, wix[k] * cd, cd)); } else if (four) { //left/right //checking alignment per row is ok because early abort if false, //row nnz likely fit in L1/L2 cache, and asymptotically better if aligned if (!scalar && w.isAligned(i, x)) { //O(n) where n is nnz in w/x double[] xvals = x.values(i); for (; k < wpos + wlen && wix[k] < bjmin; k++) wdivmm(wval[k], xvals[k], u, v, c, uix, wix[k] * cd, left, scalar, cd); } else { //scalar or O(n log m) where n/m are nnz in w/x for (; k < wpos + wlen && wix[k] < bjmin; k++) if (scalar) wdivmm(wval[k], eps, u, v, c, uix, wix[k] * cd, left, scalar, cd); else wdivmm(wval[k], x.get(i, wix[k]), u, v, c, uix, wix[k] * cd, left, scalar, cd); } } else { //left/right minus default for (; k < wpos + wlen && wix[k] < bjmin; k++) wdivmm(wval[k], u, v, c, uix, wix[k] * cd, left, mult, minus, cd); } curk[i - bi] = k - wpos; } } } } private static void matrixMultWDivMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) throws DMLRuntimeException { final boolean basic = wt.isBasic(); final boolean left = wt.isLeft(); final boolean mult = wt.isMult(); final boolean minus = wt.isMinus(); final boolean four = wt.hasFourInputs(); final boolean scalar = wt.hasScalar(); final double eps = scalar ? mX.quickGetValue(0, 0) : 0; final int n = mW.clen; final int cd = mU.clen; //output always in dense representation double[] c = ret.denseBlock; //approach: iterate over non-zeros of w, selective mm computation if (mW.sparse) //SPARSE { SparseBlock w = mW.sparseBlock; for (int i = rl; i < ru; i++) { if (!w.isEmpty(i)) { int wpos = w.pos(i); int wlen = w.size(i); int[] wix = w.indexes(i); double[] wval = w.values(i); int k = (cl == 0) ? wpos : w.posFIndexGTE(i, cl); k = (k >= 0) ? k : wpos + wlen; for (; k < wpos + wlen && wix[k] < cu; k++) { if (basic) { double uvij = dotProductGeneric(mU, mV, i, wix[k], cd); ret.appendValue(i, wix[k], uvij); } else if (four) { //left/right double xij = scalar ? eps : mX.quickGetValue(i, wix[k]); wdivmm(wval[k], xij, mU, mV, c, i, wix[k], left, scalar, cd); } else { //left/right minus/default wdivmm(wval[k], mU, mV, c, i, wix[k], left, mult, minus, cd); } } } } } else //DENSE { double[] w = mW.denseBlock; for (int i = rl, ix = rl * n; i < ru; i++, ix += n) for (int j = cl; j < cu; j++) if (w[ix + j] != 0) { if (basic) { c[ix + j] = dotProductGeneric(mU, mV, i, j, cd); } else if (four) { //left/right double xij = scalar ? 
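/* Aside (illustrative, not part of the original class): for the default right wdivmm, the
 * per-nonzero update in these kernels corresponds to C = (W / (U %*% t(V))) %*% V,
 * accumulated one scaled row of V at a time (K is the common rank dimension):
 *
 *   //for each nonzero W[i][j]: C[i,:] += (W[i][j] / uvij) * V[j,:]
 *   double uvij = 0;
 *   for (int k = 0; k < K; k++) uvij += U[i][k] * V[j][k];
 *   double t = W[i][j] / uvij;
 *   for (int k = 0; k < K; k++) C[i][k] += t * V[j][k];
 *
 * The left variant instead accumulates the scaled row of U into row j of C, which yields
 * t(W / (U %*% t(V))) %*% U; the mult and minus flags swap the division for W * uv and
 * uv - W, respectively.
 */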
eps : mX.quickGetValue(i, j); wdivmm(w[ix + j], xij, mU, mV, c, i, j, left, scalar, cd); } else { //left/right minus/default wdivmm(w[ix + j], mU, mV, c, i, j, left, mult, minus, cd); } } } } private static void matrixMultWCeMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt, int rl, int ru) { double[] w = mW.denseBlock; double[] u = mU.denseBlock; double[] v = mV.denseBlock; final int n = mW.clen; final int cd = mU.clen; double wceval = 0; // approach: iterate over all cells of X //cache-conscious blocking: due to blocksize constraint (default 1000), //a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) final int blocksizeIJ = 16; //u/v block (max at typical L2 size) //blocked execution for (int bi = rl; bi < ru; bi += blocksizeIJ) for (int bj = 0, bimin = Math.min(ru, bi + blocksizeIJ); bj < n; bj += blocksizeIJ) { int bjmin = Math.min(n, bj + blocksizeIJ); for (int i = bi, ix = bi * n, uix = bi * cd; i < bimin; i++, ix += n, uix += cd) for (int j = bj, vix = bj * cd; j < bjmin; j++, vix += cd) { double wij = w[ix + j]; if (wij != 0) { double uvij = dotProduct(u, v, uix, vix, cd); wceval += wij * FastMath.log(uvij + eps); } } } ret.quickSetValue(0, 0, wceval); } private static void matrixMultWCeMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt, int rl, int ru) { SparseBlock w = mW.sparseBlock; double[] u = mU.denseBlock; double[] v = mV.denseBlock; final int n = mW.clen; final int cd = mU.clen; double wceval = 0; // approach: iterate over W, point-wise in order to exploit sparsity // blocked over ij, while maintaining front of column indexes, where the // blocksize is chosen such that we reuse each vector on average 8 times. final int blocksizeIJ = (int) (8L * mW.rlen * mW.clen / mW.nonZeros); int[] curk = new int[blocksizeIJ]; for (int bi = rl; bi < ru; bi += blocksizeIJ) { int bimin = Math.min(ru, bi + blocksizeIJ); //prepare starting indexes for block row Arrays.fill(curk, 0); //blocked execution over column blocks for (int bj = 0; bj < n; bj += blocksizeIJ) { int bjmin = Math.min(n, bj + blocksizeIJ); for (int i = bi, uix = bi * cd; i < bimin; i++, uix += cd) { if (w.isEmpty(i)) continue; int wpos = w.pos(i); int wlen = w.size(i); int[] wix = w.indexes(i); double[] wval = w.values(i); int k = wpos + curk[i - bi]; for (; k < wpos + wlen && wix[k] < bjmin; k++) { double uvij = dotProduct(u, v, uix, wix[k] * cd, cd); wceval += wval[k] * FastMath.log(uvij + eps); } curk[i - bi] = k - wpos; } } } ret.quickSetValue(0, 0, wceval); } private static void matrixMultWCeMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, MatrixBlock ret, WCeMMType wt, int rl, int ru) { final int n = mW.clen; final int cd = mU.clen; double wceval = 0; //approach: iterate over non-zeros of w, selective mm computation if (mW.sparse) //SPARSE { SparseBlock w = mW.sparseBlock; for (int i = rl; i < ru; i++) if (!w.isEmpty(i)) { int wpos = w.pos(i); int wlen = w.size(i); int[] wix = w.indexes(i); double[] wval = w.values(i); for (int k = wpos; k < wpos + wlen; k++) { double uvij = dotProductGeneric(mU, mV, i, wix[k], cd); wceval += wval[k] * FastMath.log(uvij + eps); } } } else //DENSE { double[] w = mW.denseBlock; for (int i = rl, ix = rl * n; i < ru; i++) for (int j = 0; j < n; j++, ix++) { double wij = w[ix]; if (wij != 0) { double uvij = dotProductGeneric(mU, mV, i, j, cd); wceval += wij * FastMath.log(uvij + eps); } } } ret.quickSetValue(0, 0, wceval); } private static void 
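/* Aside (illustrative sketch, not part of the original class): weighted cross entropy
 * likewise reduces to one scalar, wceval = sum(W * log(U %*% t(V) + eps)), so an unblocked
 * reference version of the kernels above is:
 *
 *   static double wcemm(double[][] W, double[][] U, double[][] V, double eps) {
 *     double wceval = 0;
 *     for (int i = 0; i < W.length; i++)
 *       for (int j = 0; j < W[i].length; j++) {
 *         if (W[i][j] == 0) continue; //exploit sparsity of W
 *         double uvij = 0;
 *         for (int k = 0; k < U[i].length; k++)
 *           uvij += U[i][k] * V[j][k];
 *         wceval += W[i][j] * Math.log(uvij + eps);
 *       }
 *     return wceval;
 *   }
 */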
matrixMultWuMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int rl, int ru) throws DMLRuntimeException { double[] w = mW.denseBlock; double[] c = ret.denseBlock; double[] u = mU.denseBlock; double[] v = mV.denseBlock; final int n = mW.clen; final int cd = mU.clen; //note: cannot compute U %*% t(V) in-place of result w/ regular mm because //t(V) comes in transformed form and hence would require additional memory boolean flagmult = (wt == WUMMType.MULT); //approach: iterate over non-zeros of w, selective mm computation //cache-conscious blocking: due to blocksize constraint (default 1000), //a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) final int blocksizeIJ = 16; //u/v block (max at typical L2 size) //blocked execution for (int bi = rl; bi < ru; bi += blocksizeIJ) for (int bj = 0, bimin = Math.min(ru, bi + blocksizeIJ); bj < n; bj += blocksizeIJ) { int bjmin = Math.min(n, bj + blocksizeIJ); //core wumm computation for (int i = bi, ix = bi * n, uix = bi * cd; i < bimin; i++, ix += n, uix += cd) for (int j = bj, vix = bj * cd; j < bjmin; j++, vix += cd) { double wij = w[ix + j]; if (wij != 0) c[ix + j] = wumm(wij, u, v, uix, vix, flagmult, fn, cd); } } } private static void matrixMultWuMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int rl, int ru) throws DMLRuntimeException { SparseBlock w = mW.sparseBlock; SparseBlock c = ret.sparseBlock; double[] u = mU.denseBlock; double[] v = mV.denseBlock; final int cd = mU.clen; boolean flagmult = (wt == WUMMType.MULT); //approach: iterate over non-zeros of w, selective mm computation for (int i = rl, uix = rl * cd; i < ru; i++, uix += cd) if (!w.isEmpty(i)) { int wpos = w.pos(i); int wlen = w.size(i); int[] wix = w.indexes(i); double[] wval = w.values(i); c.allocate(i, wlen); for (int k = wpos; k < wpos + wlen; k++) { double cval = wumm(wval[k], u, v, uix, wix[k] * cd, flagmult, fn, cd); c.append(i, wix[k], cval); } } } private static void matrixMultWuMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int rl, int ru) throws DMLRuntimeException { final int n = mW.clen; final int cd = mU.clen; boolean flagmult = (wt == WUMMType.MULT); //approach: iterate over non-zeros of w, selective mm computation if (mW.sparse) //SPARSE { //w and c always in same representation SparseBlock w = mW.sparseBlock; SparseBlock c = ret.sparseBlock; for (int i = rl; i < ru; i++) if (!w.isEmpty(i)) { int wpos = w.pos(i); int wlen = w.size(i); int[] wix = w.indexes(i); double[] wval = w.values(i); c.allocate(i, wlen); for (int k = wpos; k < wpos + wlen; k++) { double cval = wumm(wval[k], mU, mV, i, wix[k], flagmult, fn, cd); c.append(i, wix[k], cval); } } } else //DENSE { //w and c always in same representation double[] w = mW.denseBlock; double[] c = ret.denseBlock; for (int i = rl, ix = rl * n; i < ru; i++) for (int j = 0; j < n; j++, ix++) { double wij = w[ix]; if (wij != 0) { c[ix] = wumm(wij, mU, mV, i, j, flagmult, fn, cd); } } } } //////////////////////////////////////////// // performance-relevant utility functions // //////////////////////////////////////////// /** * Computes the dot-product of two vectors. Experiments (on long vectors of * 10^7 values) showed that this generic function provides equivalent performance * even for the specific case of dotProduct(a,a,len) as used for TSMM.
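* For example, dotProduct(new double[]{1, 2, 3}, new double[]{4, 5, 6}, 3) evaluates 1*4 + 2*5 + 3*6 = 32; with len=3 the unrolled 8-block loop below is skipped entirely and only the remainder loop runs.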
* * @param a first vector * @param b second vector * @param len length * @return dot product of the two input vectors */ private static double dotProduct(double[] a, double[] b, final int len) { double val = 0; final int bn = len % 8; //compute rest for (int i = 0; i < bn; i++) val += a[i] * b[i]; //unrolled 8-block (for better instruction-level parallelism) for (int i = bn; i < len; i += 8) { //read 64B cachelines of a and b //compute cval' = sum(a * b) + cval val += a[i + 0] * b[i + 0] + a[i + 1] * b[i + 1] + a[i + 2] * b[i + 2] + a[i + 3] * b[i + 3] + a[i + 4] * b[i + 4] + a[i + 5] * b[i + 5] + a[i + 6] * b[i + 6] + a[i + 7] * b[i + 7]; } //scalar result return val; } private static double dotProduct(double[] a, double[] b, int ai, int bi, final int len) { double val = 0; final int bn = len % 8; //compute rest for (int i = 0; i < bn; i++, ai++, bi++) val += a[ai] * b[bi]; //unrolled 8-block (for better instruction-level parallelism) for (int i = bn; i < len; i += 8, ai += 8, bi += 8) { //read 64B cachelines of a and b //compute cval' = sum(a * b) + cval val += a[ai + 0] * b[bi + 0] + a[ai + 1] * b[bi + 1] + a[ai + 2] * b[bi + 2] + a[ai + 3] * b[bi + 3] + a[ai + 4] * b[bi + 4] + a[ai + 5] * b[bi + 5] + a[ai + 6] * b[bi + 6] + a[ai + 7] * b[bi + 7]; } //scalar result return val; } private static double dotProduct(double[] a, double[] b, int[] aix, int ai, final int bi, final int len) { double val = 0; final int bn = len % 8; //compute rest for (int i = ai; i < ai + bn; i++) val += a[i] * b[bi + aix[i]]; //unrolled 8-block (for better instruction-level parallelism) for (int i = ai + bn; i < ai + len; i += 8) { //read 64B cacheline of a //read 64B of b via 'gather' //compute cval' = sum(a * b) + cval val += a[i + 0] * b[bi + aix[i + 0]] + a[i + 1] * b[bi + aix[i + 1]] + a[i + 2] * b[bi + aix[i + 2]] + a[i + 3] * b[bi + aix[i + 3]] + a[i + 4] * b[bi + aix[i + 4]] + a[i + 5] * b[bi + aix[i + 5]] + a[i + 6] * b[bi + aix[i + 6]] + a[i + 7] * b[bi + aix[i + 7]]; } //scalar result return val; } private static void vectMultiplyAdd(final double aval, double[] b, double[] c, int bi, int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++, bi++, ci++) c[ci] += aval * b[bi]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8, bi += 8, ci += 8) { //read 64B cachelines of b and c //compute c' = aval * b + c //write back 64B cacheline of c = c' c[ci + 0] += aval * b[bi + 0]; c[ci + 1] += aval * b[bi + 1]; c[ci + 2] += aval * b[bi + 2]; c[ci + 3] += aval * b[bi + 3]; c[ci + 4] += aval * b[bi + 4]; c[ci + 5] += aval * b[bi + 5]; c[ci + 6] += aval * b[bi + 6]; c[ci + 7] += aval * b[bi + 7]; } } private static void vectMultiplyAdd2(final double aval1, final double aval2, double[] b, double[] c, int bi1, int bi2, int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++, bi1++, bi2++, ci++) c[ci] += aval1 * b[bi1] + aval2 * b[bi2]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8, bi1 += 8, bi2 += 8, ci += 8) { //read 64B cachelines of b (2x) and c //compute c' = aval_1 * b_1 + aval_2 * b_2 + c //write back 64B cacheline of c = c' c[ci + 0] += aval1 * b[bi1 + 0] + aval2 * b[bi2 + 0]; c[ci + 1] += aval1 * b[bi1 + 1] + aval2 * b[bi2 + 1]; c[ci + 2] += aval1 * b[bi1 + 2] + aval2 * b[bi2 + 2]; c[ci + 3] += aval1 * b[bi1 + 3] + aval2 * b[bi2 + 3]; c[ci + 4] += aval1 * b[bi1 + 4] + aval2 * b[bi2 + 4]; c[ci + 5] += aval1 * 
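/* Aside (illustrative, not part of the original class): vectMultiplyAdd2 here and the 3-
 * and 4-row variants below are fused versions of the basic daxpy-style vectMultiplyAdd
 * above. Semantically,
 *
 *   vectMultiplyAdd2(a1, a2, b, c, bi1, bi2, ci, len);
 *
 * produces the same result as two separate calls
 *
 *   vectMultiplyAdd(a1, b, c, bi1, ci, len);
 *   vectMultiplyAdd(a2, b, c, bi2, ci, len);
 *
 * but reads and writes each 64B cacheline of c once instead of twice, which is what makes
 * the 4-row blocking in the dense and TSMM kernels pay off.
 */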
b[bi1 + 5] + aval2 * b[bi2 + 5]; c[ci + 6] += aval1 * b[bi1 + 6] + aval2 * b[bi2 + 6]; c[ci + 7] += aval1 * b[bi1 + 7] + aval2 * b[bi2 + 7]; } } private static void vectMultiplyAdd3(final double aval1, final double aval2, final double aval3, double[] b, double[] c, int bi1, int bi2, int bi3, int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++, bi1++, bi2++, bi3++, ci++) c[ci] += aval1 * b[bi1] + aval2 * b[bi2] + aval3 * b[bi3]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8, bi1 += 8, bi2 += 8, bi3 += 8, ci += 8) { //read 64B cachelines of b (3x) and c //compute c' = aval_1 * b_1 + aval_2 * b_2 + aval_3 * b_3 + c //write back 64B cacheline of c = c' c[ci + 0] += aval1 * b[bi1 + 0] + aval2 * b[bi2 + 0] + aval3 * b[bi3 + 0]; c[ci + 1] += aval1 * b[bi1 + 1] + aval2 * b[bi2 + 1] + aval3 * b[bi3 + 1]; c[ci + 2] += aval1 * b[bi1 + 2] + aval2 * b[bi2 + 2] + aval3 * b[bi3 + 2]; c[ci + 3] += aval1 * b[bi1 + 3] + aval2 * b[bi2 + 3] + aval3 * b[bi3 + 3]; c[ci + 4] += aval1 * b[bi1 + 4] + aval2 * b[bi2 + 4] + aval3 * b[bi3 + 4]; c[ci + 5] += aval1 * b[bi1 + 5] + aval2 * b[bi2 + 5] + aval3 * b[bi3 + 5]; c[ci + 6] += aval1 * b[bi1 + 6] + aval2 * b[bi2 + 6] + aval3 * b[bi3 + 6]; c[ci + 7] += aval1 * b[bi1 + 7] + aval2 * b[bi2 + 7] + aval3 * b[bi3 + 7]; } } private static void vectMultiplyAdd4(final double aval1, final double aval2, final double aval3, final double aval4, double[] b, double[] c, int bi1, int bi2, int bi3, int bi4, int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++, bi1++, bi2++, bi3++, bi4++, ci++) c[ci] += aval1 * b[bi1] + aval2 * b[bi2] + aval3 * b[bi3] + aval4 * b[bi4]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8, bi1 += 8, bi2 += 8, bi3 += 8, bi4 += 8, ci += 8) { //read 64B cachelines of b (4x) and c //compute c' = aval_1 * b_1 + aval_2 * b_2 + aval_3 * b_3 + aval_4 * b_4 + c //write back 64B cacheline of c = c' c[ci + 0] += aval1 * b[bi1 + 0] + aval2 * b[bi2 + 0] + aval3 * b[bi3 + 0] + aval4 * b[bi4 + 0]; c[ci + 1] += aval1 * b[bi1 + 1] + aval2 * b[bi2 + 1] + aval3 * b[bi3 + 1] + aval4 * b[bi4 + 1]; c[ci + 2] += aval1 * b[bi1 + 2] + aval2 * b[bi2 + 2] + aval3 * b[bi3 + 2] + aval4 * b[bi4 + 2]; c[ci + 3] += aval1 * b[bi1 + 3] + aval2 * b[bi2 + 3] + aval3 * b[bi3 + 3] + aval4 * b[bi4 + 3]; c[ci + 4] += aval1 * b[bi1 + 4] + aval2 * b[bi2 + 4] + aval3 * b[bi3 + 4] + aval4 * b[bi4 + 4]; c[ci + 5] += aval1 * b[bi1 + 5] + aval2 * b[bi2 + 5] + aval3 * b[bi3 + 5] + aval4 * b[bi4 + 5]; c[ci + 6] += aval1 * b[bi1 + 6] + aval2 * b[bi2 + 6] + aval3 * b[bi3 + 6] + aval4 * b[bi4 + 6]; c[ci + 7] += aval1 * b[bi1 + 7] + aval2 * b[bi2 + 7] + aval3 * b[bi3 + 7] + aval4 * b[bi4 + 7]; } } @SuppressWarnings("unused") private static void vectMultiplyAdd(final double aval, double[] b, double[] c, int[] bix, final int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++) c[ci + bix[j]] += aval * b[j]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8) { //read 64B cacheline of b //read 64B of c via 'gather' //compute c' = aval * b + c //write back 64B of c = c' via 'scatter' c[ci + bix[j + 0]] += aval * b[j + 0]; c[ci + bix[j + 1]] += aval * b[j + 1]; c[ci + bix[j + 2]] += aval * b[j + 2]; c[ci + bix[j + 3]] += aval * b[j + 3]; c[ci + bix[j + 4]] += aval * b[j + 4]; c[ci + bix[j + 5]] += aval * b[j + 5]; c[ci + bix[j + 6]] += aval * b[j + 6]; c[ci
+ bix[j + 7]] += aval * b[j + 7]; } } private static void vectMultiplyAdd(final double aval, double[] b, double[] c, int[] bix, final int bi, final int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = bi; j < bi + bn; j++) c[ci + bix[j]] += aval * b[j]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bi + bn; j < bi + len; j += 8) { //read 64B cacheline of b //read 64B of c via 'gather' //compute c' = aval * b + c //write back 64B of c = c' via 'scatter' c[ci + bix[j + 0]] += aval * b[j + 0]; c[ci + bix[j + 1]] += aval * b[j + 1]; c[ci + bix[j + 2]] += aval * b[j + 2]; c[ci + bix[j + 3]] += aval * b[j + 3]; c[ci + bix[j + 4]] += aval * b[j + 4]; c[ci + bix[j + 5]] += aval * b[j + 5]; c[ci + bix[j + 6]] += aval * b[j + 6]; c[ci + bix[j + 7]] += aval * b[j + 7]; } } private static void vectMultiplyWrite(final double aval, double[] b, double[] c, int bi, int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++, bi++, ci++) c[ci] = aval * b[bi]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8, bi += 8, ci += 8) { //read 64B cachelines of b and c //compute c' = aval * b //write back 64B cacheline of c = c' c[ci + 0] = aval * b[bi + 0]; c[ci + 1] = aval * b[bi + 1]; c[ci + 2] = aval * b[bi + 2]; c[ci + 3] = aval * b[bi + 3]; c[ci + 4] = aval * b[bi + 4]; c[ci + 5] = aval * b[bi + 5]; c[ci + 6] = aval * b[bi + 6]; c[ci + 7] = aval * b[bi + 7]; } } @SuppressWarnings("unused") private static void vectMultiplyWrite(double[] a, double[] b, double[] c, int ai, int bi, int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++, ai++, bi++, ci++) c[ci] = a[ai] * b[bi]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8, ai += 8, bi += 8, ci += 8) { //read 64B cachelines of a and b //compute c' = a * b //write back 64B cacheline of c = c' c[ci + 0] = a[ai + 0] * b[bi + 0]; c[ci + 1] = a[ai + 1] * b[bi + 1]; c[ci + 2] = a[ai + 2] * b[bi + 2]; c[ci + 3] = a[ai + 3] * b[bi + 3]; c[ci + 4] = a[ai + 4] * b[bi + 4]; c[ci + 5] = a[ai + 5] * b[bi + 5]; c[ci + 6] = a[ai + 6] * b[bi + 6]; c[ci + 7] = a[ai + 7] * b[bi + 7]; } } private static void vectMultiply(double[] a, double[] c, int ai, int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++, ai++, ci++) c[ci] *= a[ai]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8, ai += 8, ci += 8) { //read 64B cachelines of a and c //compute c' = c * a //write back 64B cacheline of c = c' c[ci + 0] *= a[ai + 0]; c[ci + 1] *= a[ai + 1]; c[ci + 2] *= a[ai + 2]; c[ci + 3] *= a[ai + 3]; c[ci + 4] *= a[ai + 4]; c[ci + 5] *= a[ai + 5]; c[ci + 6] *= a[ai + 6]; c[ci + 7] *= a[ai + 7]; } } private static void vectAdd(double[] a, double[] c, int ai, int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++, ai++, ci++) c[ci] += a[ai]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8, ai += 8, ci += 8) { //read 64B cachelines of a and c //compute c' = c + a //write back 64B cacheline of c = c' c[ci + 0] += a[ai + 0]; c[ci + 1] += a[ai + 1]; c[ci + 2] += a[ai + 2]; c[ci + 3] += a[ai + 3]; c[ci + 4] += a[ai + 4]; c[ci + 5] += a[ai + 5]; c[ci + 6] += a[ai + 6]; c[ci + 7] += a[ai + 7]; } } private static void vectAdd4(double[] a1,
double[] a2, double[] a3, double[] a4, double[] c, int ai, int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++, ai++, ci++) c[ci] += a1[ai] + a2[ai] + a3[ai] + a4[ai]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8, ai += 8, ci += 8) { //read 64B cachelines of a (4x) and c //compute c' = c + a1 + a2 + a3 + a4 //write back 64B cacheline of c = c' c[ci + 0] += a1[ai + 0] + a2[ai + 0] + a3[ai + 0] + a4[ai + 0]; c[ci + 1] += a1[ai + 1] + a2[ai + 1] + a3[ai + 1] + a4[ai + 1]; c[ci + 2] += a1[ai + 2] + a2[ai + 2] + a3[ai + 2] + a4[ai + 2]; c[ci + 3] += a1[ai + 3] + a2[ai + 3] + a3[ai + 3] + a4[ai + 3]; c[ci + 4] += a1[ai + 4] + a2[ai + 4] + a3[ai + 4] + a4[ai + 4]; c[ci + 5] += a1[ai + 5] + a2[ai + 5] + a3[ai + 5] + a4[ai + 5]; c[ci + 6] += a1[ai + 6] + a2[ai + 6] + a3[ai + 6] + a4[ai + 6]; c[ci + 7] += a1[ai + 7] + a2[ai + 7] + a3[ai + 7] + a4[ai + 7]; } } private static void vectSubtract(double[] a, double[] c, int ai, int ci, final int len) { final int bn = len % 8; //rest, not aligned to 8-blocks for (int j = 0; j < bn; j++, ai++, ci++) c[ci] -= a[ai]; //unrolled 8-block (for better instruction-level parallelism) for (int j = bn; j < len; j += 8, ai += 8, ci += 8) { //read 64B cachelines of a and c //compute c' = c - a //write back 64B cacheline of c = c' c[ci + 0] -= a[ai + 0]; c[ci + 1] -= a[ai + 1]; c[ci + 2] -= a[ai + 2]; c[ci + 3] -= a[ai + 3]; c[ci + 4] -= a[ai + 4]; c[ci + 5] -= a[ai + 5]; c[ci + 6] -= a[ai + 6]; c[ci + 7] -= a[ai + 7]; } } private static double wsigmoid(final double wij, double[] u, double[] v, final int uix, final int vix, final boolean flagminus, final boolean flaglog, final int len) { //compute dot product over ui vj double uvij = dotProduct(u, v, uix, vix, len); //compute core sigmoid function double cval = flagminus ? 1 / (1 + FastMath.exp(uvij)) : 1 / (1 + FastMath.exp(-uvij)); //compute weighted output return wij * ((flaglog) ? FastMath.log(cval) : cval); } private static double wsigmoid(final double wij, MatrixBlock u, MatrixBlock v, final int uix, final int vix, final boolean flagminus, final boolean flaglog, final int len) { //compute dot product over ui vj double uvij = dotProductGeneric(u, v, uix, vix, len); //compute core sigmoid function double cval = flagminus ? 1 / (1 + FastMath.exp(uvij)) : 1 / (1 + FastMath.exp(-uvij)); //compute weighted output return wij * ((flaglog) ? FastMath.log(cval) : cval); } private static void wdivmm(final double wij, double[] u, double[] v, double[] c, final int uix, final int vix, final boolean left, final boolean mult, final boolean minus, final int len) { //compute dot product over ui vj double uvij = dotProduct(u, v, uix, vix, len); //compute core wdivmm double tmpval = minus ? uvij - wij : mult ? wij * uvij : wij / uvij; //prepare inputs for final mm int bix = left ? uix : vix; int cix = left ? vix : uix; double[] b = left ? u : v; //compute final mm output vectMultiplyAdd(tmpval, b, c, bix, cix, len); } private static void wdivmm(final double wij, final double xij, double[] u, double[] v, double[] c, final int uix, final int vix, final boolean left, final boolean scalar, final int len) { //compute dot product over ui vj double uvij = dotProduct(u, v, uix, vix, len); //compute core wdivmm double tmpval = scalar ? wij / (uvij + xij) : wij * (uvij - xij); //prepare inputs for final mm int bix = left ? uix : vix; int cix = left ? vix : uix; double[] b = left ?
u : v; //compute final mm output vectMultiplyAdd(tmpval, b, c, bix, cix, len); } private static void wdivmm(final double wij, MatrixBlock u, MatrixBlock v, double[] c, final int uix, final int vix, final boolean left, boolean mult, final boolean minus, final int len) { //compute dot product over ui vj double uvij = dotProductGeneric(u, v, uix, vix, len); //compute core wdivmm double wtmp = minus ? uvij - wij : mult ? wij * uvij : wij / uvij; //prepare inputs for final mm int bix = left ? uix : vix; int cix = left ? vix * len : uix * len; MatrixBlock b = left ? u : v; //compute final mm for (int k2 = 0; k2 < len; k2++) c[cix + k2] += b.quickGetValue(bix, k2) * wtmp; } private static void wdivmm(final double wij, final double xij, MatrixBlock u, MatrixBlock v, double[] c, final int uix, final int vix, final boolean left, final boolean scalar, final int len) { //compute dot product over ui vj double uvij = dotProductGeneric(u, v, uix, vix, len); //compute core wdivmm double wtmp = scalar ? wij / (uvij + xij) : wij * (uvij - xij); //prepare inputs for final mm int bix = left ? uix : vix; int cix = left ? vix * len : uix * len; MatrixBlock b = left ? u : v; //compute final mm for (int k2 = 0; k2 < len; k2++) c[cix + k2] += b.quickGetValue(bix, k2) * wtmp; } private static double wumm(final double wij, double[] u, double[] v, final int uix, final int vix, final boolean flagmult, ValueFunction fn, final int len) throws DMLRuntimeException { //compute dot product over ui vj double uvij = dotProduct(u, v, uix, vix, len); //compute unary operations double cval = fn.execute(uvij); //compute weighted output return flagmult ? wij * cval : wij / cval; } private static double wumm(final double wij, MatrixBlock u, MatrixBlock v, final int uix, final int vix, final boolean flagmult, ValueFunction fn, final int len) throws DMLRuntimeException { //compute dot product over ui vj double uvij = dotProductGeneric(u, v, uix, vix, len); //compute unary operations double cval = fn.execute(uvij); //compute weighted output return flagmult ? wij * cval : wij / cval; } private static double dotProductGeneric(MatrixBlock a, MatrixBlock b, final int ai, final int bi, int len) { double val = 0; for (int k2 = 0; k2 < len; k2++) val += a.quickGetValue(ai, k2) * b.quickGetValue(bi, k2); return val; } /** * Used for all version of TSMM where the result is known to be symmetric. * Hence, we compute only the upper triangular matrix and copy this partial * result down to lower triangular matrix once. 
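* For example, in a 3x3 result the loop copies entry (0,1) to (1,0), (0,2) to (2,0), and (1,2) to (2,1), mirroring the computed upper triangle across the diagonal.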
* * @param ret matrix */ private static void copyUpperToLowerTriangle(MatrixBlock ret) { double[] c = ret.denseBlock; final int m = ret.rlen; final int n = ret.clen; //copy symmetric values for (int i = 0, uix = 0; i < m; i++, uix += n) for (int j = i + 1, lix = j * n + i; j < n; j++, lix += n) c[lix] = c[uix + j]; } private static MatrixBlock prepMatrixMultTransposeSelfInput(MatrixBlock m1, boolean leftTranspose) throws DMLRuntimeException { MatrixBlock ret = m1; if (!leftTranspose && m1.sparse && m1.rlen > 1) //X%*%t(X) SPARSE MATRIX { //directly via LibMatrixReorg in order to prevent sparsity change MatrixBlock tmpBlock = new MatrixBlock(m1.clen, m1.rlen, m1.sparse); LibMatrixReorg.reorg(m1, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject())); ret = tmpBlock; } return ret; } private static boolean checkPrepMatrixMultRightInput(MatrixBlock m1, MatrixBlock m2) { //transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse && m1.rlen > m2.clen && m2.rlen > 64 && m2.clen > 1 && m2.clen < 64 && 8 * m2.rlen * m2.clen < 256 * 1024); //rhs fits in L2 cache } private static boolean checkParMatrixMultRightInputRows(MatrixBlock m1, MatrixBlock m2, int k) { //parallelize over rows in rhs matrix if number of rows in lhs/output is very small return (m1.rlen == 1 && LOW_LEVEL_OPTIMIZATION && m2.clen > 1 && !(m1.isUltraSparse() || m2.isUltraSparse())) || (m1.rlen <= 16 && LOW_LEVEL_OPTIMIZATION && m2.clen > 1 && m2.rlen > m1.rlen && (!m1.isUltraSparse() && !m2.sparse) //dense-dense / sparse/dense && (long) k * 8 * m1.rlen * m2.clen < MEM_OVERHEAD_THRESHOLD); } private static boolean checkParMatrixMultRightInputCols(MatrixBlock m1, MatrixBlock m2, int k, boolean pm2r) { //parallelize over cols in rhs matrix if dense, number of cols in rhs is large, and lhs fits in l2 return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse && m2.clen > k * 1024 && m1.rlen < k * 32 && !pm2r && 8 * m1.rlen * m1.clen < 256 * 1024); //lhs fits in L2 cache } private static MatrixBlock prepMatrixMultRightInput(MatrixBlock m1, MatrixBlock m2) throws DMLRuntimeException { MatrixBlock ret = m2; //transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output if (checkPrepMatrixMultRightInput(m1, m2)) { MatrixBlock tmpBlock = new MatrixBlock(m2.clen, m2.rlen, m2.sparse); LibMatrixReorg.reorg(m2, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject())); ret = tmpBlock; } return ret; } private static int copyNonZeroElements(double[] a, final int aixi, final int bk, final int bj, final int n, double[] tmpa, int[] tmpbi, final int bklen) { int knnz = 0; for (int k = 0; k < bklen; k++) if (a[aixi + k] != 0) { tmpa[knnz] = a[aixi + k]; tmpbi[knnz] = (bk + k) * n + bj; //scan index on b knnz++; } return knnz; } private static int copyNonZeroElements(double[] a, int aixi, final int bk, final int bj, final int n, final int nx, double[] tmpa, int[] tmpbi, final int bklen) { int knnz = 0; for (int k = 0; k < bklen; k++, aixi += n) if (a[aixi] != 0) { tmpa[knnz] = a[aixi]; tmpbi[knnz] = (bk + k) * nx + bj; //scan index on b knnz++; } return knnz; } private static void sumScalarResults(List<Future<Double>> tasks, MatrixBlock ret) throws InterruptedException, ExecutionException { //aggregate partial results and check for errors double val = 0; for (Future<Double> task : tasks) val += task.get(); ret.quickSetValue(0, 0, val); } @SuppressWarnings("unused") private static void sumDenseResults(double[][] partret, 
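/* Aside (illustrative, not part of the original class): the guard in
 * checkPrepMatrixMultRightInput above is plain arithmetic on the rhs footprint. For
 * example, a dense m2 with rlen=2000 and clen=16 occupies 8 * 2000 * 16 = 256,000B and
 * still fits the assumed 256KB (262,144B) L2 cache, so it is transposed once up front;
 * with clen of 64 or more the check already fails on the m2.clen < 64 condition and m2 is
 * used as-is.
 */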
double[] ret) { final int len = ret.length; final int k = partret.length; final int bk = k % 4; final int blocksize = 2 * 1024; //16KB (half of common L1 data) //cache-conscious aggregation to prevent repeated scans/writes of ret for (int bi = 0; bi < len; bi += blocksize) { int llen = Math.min(len - bi, blocksize); //aggregate next block from all partial results for (int j = 0; j < bk; j++) //rest (not aligned to 4) vectAdd(partret[j], ret, bi, bi, llen); for (int j = bk; j < k; j += 4) //4 partial results at a time vectAdd4(partret[j], partret[j + 1], partret[j + 2], partret[j + 3], ret, bi, bi, llen); } } private static ArrayList<Integer> getBalancedBlockSizes(int len, int k) { ArrayList<Integer> ret = new ArrayList<Integer>(); int base = len / k; int rest = len % k; for (int i = 0; i < k; i++) { int val = base + (i < rest ? 1 : 0); if (val > 0) ret.add(val); } return ret; } ///////////////////////////////////////////////////////// // Task Implementations for Multi-Threaded Operations // ///////////////////////////////////////////////////////// private static class MatrixMultTask implements Callable<Object> { private MatrixBlock _m1 = null; private MatrixBlock _m2 = null; private MatrixBlock _ret = null; private boolean _tm2 = false; //transposed m2 private boolean _pm2r = false; //par over m2 rows private boolean _pm2c = false; //par over m2 cols private int _rl = -1; private int _ru = -1; protected MatrixMultTask(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2r, boolean pm2c, int rl, int ru) { _m1 = m1; _m2 = m2; _tm2 = tm2; _pm2r = pm2r; _pm2c = pm2c; _rl = rl; _ru = ru; if (pm2r) { //vector-matrix / matrix-matrix //allocate local result for partial aggregation _ret = new MatrixBlock(ret.rlen, ret.clen, false); } else { //default case _ret = ret; } } @Override public Object call() throws DMLRuntimeException { //setup target index ranges int rl = _pm2c ? 0 : _rl; int ru = _pm2c ? _m1.rlen : _ru; int cl = _pm2c ? _rl : 0; int cu = _pm2c ?
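/* Aside (illustrative sketch, not part of the original class): tasks like MatrixMultTask
 * are driven by the standard ExecutorService pattern, i.e., partition the row range, submit
 * one Callable per partition, and aggregate the Futures, as sumScalarResults above does. A
 * minimal standalone version of that driver shape (partialSum, data, rows, and k are
 * hypothetical stand-ins; checked-exception handling elided):
 *
 *   ExecutorService pool = Executors.newFixedThreadPool(k);
 *   try {
 *     List<Future<Double>> tasks = new ArrayList<Future<Double>>();
 *     int blklen = (int) Math.ceil((double) rows / k);
 *     for (int rl = 0; rl < rows; rl += blklen) {
 *       final int lb = rl, ub = Math.min(rows, rl + blklen);
 *       tasks.add(pool.submit(new Callable<Double>() {
 *         public Double call() { return partialSum(data, lb, ub); } //per-range work
 *       }));
 *     }
 *     double sum = 0;
 *     for (Future<Double> f : tasks)
 *       sum += f.get(); //sequential aggregation of partial results
 *   }
 *   finally { pool.shutdown(); }
 */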
_ru : _ret.clen; //thread-local allocation if (_pm2r) _ret.allocateDenseBlock(); //compute block matrix multiplication if (_m1.isUltraSparse() || _m2.isUltraSparse()) matrixMultUltraSparse(_m1, _m2, _ret, rl, ru); else if (!_m1.sparse && !_m2.sparse) matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rl, ru, cl, cu); else if (_m1.sparse && _m2.sparse) matrixMultSparseSparse(_m1, _m2, _ret, _pm2r, rl, ru); else if (_m1.sparse) matrixMultSparseDense(_m1, _m2, _ret, _pm2r, rl, ru); else matrixMultDenseSparse(_m1, _m2, _ret, _pm2r, rl, ru); //maintain block nnz (upper bounds inclusive) if (!_pm2r) return _ret.recomputeNonZeros(rl, ru - 1, cl, cu - 1); else return _ret.getDenseBlock(); } } private static class MatrixMultChainTask implements Callable<double[]> { private MatrixBlock _m1 = null; private MatrixBlock _m2 = null; private MatrixBlock _m3 = null; private ChainType _ct = null; private int _rl = -1; private int _ru = -1; protected MatrixMultChainTask(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, ChainType ct, int rl, int ru) throws DMLRuntimeException { _m1 = mX; _m2 = mV; _m3 = mW; _ct = ct; _rl = rl; _ru = ru; } @Override public double[] call() throws DMLRuntimeException { //thread-local allocation for partial aggregation MatrixBlock ret = new MatrixBlock(1, _m1.clen, false); ret.allocateDenseBlock(); if (_m1.sparse) matrixMultChainSparse(_m1, _m2, _m3, ret, _ct, _rl, _ru); else matrixMultChainDense(_m1, _m2, _m3, ret, _ct, _rl, _ru); //NOTE: we dont do global aggregation from concurrent tasks in order //to prevent synchronization (sequential aggregation led to better //performance after JIT) return ret.getDenseBlock(); } } private static class MatrixMultTransposeTask implements Callable<Object> { private MatrixBlock _m1 = null; private MatrixBlock _ret = null; private boolean _left = true; private int _rl = -1; private int _ru = -1; protected MatrixMultTransposeTask(MatrixBlock m1, MatrixBlock ret, boolean left, int rl, int ru) { _m1 = m1; _ret = ret; _left = left; _rl = rl; _ru = ru; } @Override public Object call() throws DMLRuntimeException { if (_m1.sparse) matrixMultTransposeSelfSparse(_m1, _ret, _left, _rl, _ru); else matrixMultTransposeSelfDense(_m1, _ret, _left, _rl, _ru); return null; } } private static class MatrixMultPermuteTask implements Callable<Object> { private MatrixBlock _pm1 = null; private MatrixBlock _m2 = null; private MatrixBlock _ret1 = null; private MatrixBlock _ret2 = null; private int _rl = -1; private int _ru = -1; protected MatrixMultPermuteTask(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru) { _pm1 = pm1; _m2 = m2; _ret1 = ret1; _ret2 = ret2; _rl = rl; _ru = ru; } @Override public Object call() throws DMLRuntimeException { if (_m2.sparse) matrixMultPermuteSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru); else if (_ret1.sparse) matrixMultPermuteDenseSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru); else matrixMultPermuteDense(_pm1, _m2, _ret1, _ret2, _rl, _ru); return null; } } private static class MatrixMultWSLossTask implements Callable<Double> { private MatrixBlock _mX = null; private MatrixBlock _mU = null; private MatrixBlock _mV = null; private MatrixBlock _mW = null; private MatrixBlock _ret = null; private WeightsType _wt = null; private int _rl = -1; private int _ru = -1; protected MatrixMultWSLossTask(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, WeightsType wt, int rl, int ru) throws DMLRuntimeException { _mX = mX; _mU = mU; _mV = mV; _mW = mW; _wt = wt; _rl = rl; _ru = ru; //allocate local result for 
partial aggregation _ret = new MatrixBlock(1, 1, false); _ret.allocateDenseBlock(); } @Override public Double call() throws DMLRuntimeException { if (!_mX.sparse && !_mU.sparse && !_mV.sparse && (_mW == null || !_mW.sparse) && !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() && (_mW == null || !_mW.isEmptyBlock())) matrixMultWSLossDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru); else if (_mX.sparse && !_mU.sparse && !_mV.sparse && (_mW == null || _mW.sparse) && !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() && (_mW == null || !_mW.isEmptyBlock())) matrixMultWSLossSparseDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru); else matrixMultWSLossGeneric(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru); return _ret.quickGetValue(0, 0); } } private static class MatrixMultWSigmoidTask implements Callable<Long> { private MatrixBlock _mW = null; private MatrixBlock _mU = null; private MatrixBlock _mV = null; private MatrixBlock _ret = null; private WSigmoidType _wt = null; private int _rl = -1; private int _ru = -1; protected MatrixMultWSigmoidTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru) throws DMLRuntimeException { _mW = mW; _mU = mU; _mV = mV; _ret = ret; _wt = wt; _rl = rl; _ru = ru; } @Override public Long call() throws DMLRuntimeException { //core weighted sigmoid mm computation if (!_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) matrixMultWSigmoidDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru); else if (_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) matrixMultWSigmoidSparseDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru); else matrixMultWSigmoidGeneric(_mW, _mU, _mV, _ret, _wt, _rl, _ru); //maintain block nnz (upper bounds inclusive) return _ret.recomputeNonZeros(_rl, _ru - 1, 0, _ret.getNumColumns() - 1); } } private static class MatrixMultWDivTask implements Callable<Long> { private MatrixBlock _mW = null; private MatrixBlock _mU = null; private MatrixBlock _mV = null; private MatrixBlock _mX = null; private MatrixBlock _ret = null; private WDivMMType _wt = null; private int _rl = -1; private int _ru = -1; private int _cl = -1; private int _cu = -1; protected MatrixMultWDivTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) throws DMLRuntimeException { _mW = mW; _mU = mU; _mV = mV; _mX = mX; _wt = wt; _rl = rl; _ru = ru; _cl = cl; _cu = cu; _ret = ret; } @Override public Long call() throws DMLRuntimeException { //core weighted div mm computation boolean scalarX = _wt.hasScalar(); if (!_mW.sparse && !_mU.sparse && !_mV.sparse && (_mX == null || !_mX.sparse || scalarX) && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) matrixMultWDivMMDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu); else if (_mW.sparse && !_mU.sparse && !_mV.sparse && (_mX == null || _mX.sparse || scalarX) && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) matrixMultWDivMMSparseDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu); else matrixMultWDivMMGeneric(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu); //maintain partial nnz for right (upper bounds inclusive) int rl = _wt.isLeft() ? _cl : _rl; int ru = _wt.isLeft() ?
_cu : _ru; return _ret.recomputeNonZeros(rl, ru - 1, 0, _ret.getNumColumns() - 1); } } private static class MatrixMultWCeTask implements Callable<Double> { private MatrixBlock _mW = null; private MatrixBlock _mU = null; private MatrixBlock _mV = null; private double _eps = 0.0; private MatrixBlock _ret = null; private WCeMMType _wt = null; private int _rl = -1; private int _ru = -1; protected MatrixMultWCeTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, double eps, WCeMMType wt, int rl, int ru) throws DMLRuntimeException { _mW = mW; _mU = mU; _mV = mV; _eps = eps; _wt = wt; _rl = rl; _ru = ru; //allocate local result for partial aggregation _ret = new MatrixBlock(1, 1, false); _ret.allocateDenseBlock(); } @Override public Double call() throws DMLRuntimeException { //core weighted cross entropy mm computation if (!_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) matrixMultWCeMMDense(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru); else if (_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) matrixMultWCeMMSparseDense(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru); else matrixMultWCeMMGeneric(_mW, _mU, _mV, _eps, _ret, _wt, _rl, _ru); return _ret.quickGetValue(0, 0); } } private static class MatrixMultWuTask implements Callable<Long> { private MatrixBlock _mW = null; private MatrixBlock _mU = null; private MatrixBlock _mV = null; private MatrixBlock _ret = null; private WUMMType _wt = null; private ValueFunction _fn = null; private int _rl = -1; private int _ru = -1; protected MatrixMultWuTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WUMMType wt, ValueFunction fn, int rl, int ru) throws DMLRuntimeException { _mW = mW; _mU = mU; _mV = mV; _ret = ret; _wt = wt; _fn = fn; _rl = rl; _ru = ru; } @Override public Long call() throws DMLRuntimeException { //core weighted unary mm computation if (!_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) matrixMultWuMMDense(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru); else if (_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()) matrixMultWuMMSparseDense(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru); else matrixMultWuMMGeneric(_mW, _mU, _mV, _ret, _wt, _fn, _rl, _ru); //maintain block nnz (upper bounds inclusive) return _ret.recomputeNonZeros(_rl, _ru - 1, 0, _ret.getNumColumns() - 1); } } }
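/* Usage sketch (illustrative, not part of the original class): the public matrixMult
 * entry points at the top of this class are the intended interface. Multiplying two small
 * dense blocks, using only constructors and accessors that already appear in this file
 * (the call throws DMLRuntimeException):
 *
 *   MatrixBlock m1 = new MatrixBlock(2, 3, false); //2x3, dense
 *   MatrixBlock m2 = new MatrixBlock(3, 2, false); //3x2, dense
 *   //... populate both inputs via quickSetValue(i, j, v) ...
 *   MatrixBlock ret = new MatrixBlock(2, 2, false);
 *   LibMatrixMult.matrixMult(m1, m2, ret);
 *   double c00 = ret.quickGetValue(0, 0); //read back result cells
 */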