Java tutorial
/*
 * Copyright (c) 2018 by Andrew Charneski.
 *
 * The author licenses this file to you under the
 * Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance
 * with the License. You may obtain a copy
 * of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.simiacryptus.mindseye.layers.cudnn;

import com.google.gson.JsonObject;
import com.simiacryptus.mindseye.lang.*;
import com.simiacryptus.mindseye.lang.cudnn.*;
import jcuda.jcudnn.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.IntStream;

/**
 * Computes a Gramian-style summary of an image tensor on the GPU using cuDNN tensor operations.
 *
 * <p>The layer accepts exactly one 3-dimensional input. With {@code bands} input channels, the
 * output of each batch item has dimensions {@code [1, 1, bands * bands]}: for every pair of bands
 * the element-wise product is summed over all pixels and scaled by {@code alpha / pixels}, where
 * {@code pixels = width * height}.
 *
 * <p>The layer has no trainable state; only {@code precision} and {@code alpha} are serialized.
 */
@SuppressWarnings("serial")
public class GramianLayer extends LayerBase implements MultiPrecision<GramianLayer> {
  private static final Logger log = LoggerFactory.getLogger(GramianLayer.class);

  private Precision precision = Precision.Double;
  private double alpha = 1.0;

  /**
   * Instantiates a new Gramian layer with {@link Precision#Double} precision and an alpha of 1.0.
   */
  public GramianLayer() {
  }

  /**
   * Instantiates a Gramian layer from its JSON form, reading the {@code "precision"} and
   * {@code "alpha"} properties.
   *
   * @param json the serialized layer
   * @param rs   the resource map (not read by this constructor)
   */
  protected GramianLayer(@Nonnull final JsonObject json, Map<CharSequence, byte[]> rs) {
    super(json);
    this.precision = Precision.valueOf(json.getAsJsonPrimitive("precision").getAsString());
    this.alpha = json.getAsJsonPrimitive("alpha").getAsDouble();
  }

  /**
   * Deserializes a Gramian layer from JSON.
   *
   * @param json the serialized layer
   * @param rs   the resource map
   * @return the reconstructed layer
   */
  public static GramianLayer fromJson(@Nonnull final JsonObject json, Map<CharSequence, byte[]> rs) {
    return new GramianLayer(json, rs);
  }

  /**
   * Evaluates the layer on a single input and returns a result whose accumulator passes the
   * gradient back to that input.
   *
   * @param inObj the inputs; exactly one is expected, carrying 3-dimensional tensors
   * @return the result wrapping the {@code [1, 1, bands * bands]} output
   */
  @Nullable
  @Override
  public Result evalAndFree(final Result... inObj) {
    assert 1 == inObj.length;
    TensorList inputData = inObj[0].getData();
    int[] inputDimensions = inputData.getDimensions();
    assert 3 == inputDimensions.length;
    return new Result(CudaSystem.run(gpu -> {
      CudaTensor tensor = gpu.getTensor(inputData, precision, MemoryType.Device, false);
      CudaTensorList output = getOutput(gpu, tensor);
      tensor.freeRef();
      return output;
    }, inputData), (@Nonnull final DeltaSet<UUID> buffer, @Nonnull final TensorList delta) -> {
      // The incoming delta must have the same shape as the forward output.
      @Nonnull final int[] outputDimensions = {1, 1, inputDimensions[2] * inputDimensions[2]};
      if (!Arrays.equals(delta.getDimensions(), outputDimensions)) {
        throw new AssertionError(
            Arrays.toString(delta.getDimensions()) + " != " + Arrays.toString(outputDimensions));
      }
      if (inObj[0].isAlive()) {
        final TensorList passbackTensorList = CudaSystem.run(gpu -> {
          @Nullable final CudaTensor inputTensor = gpu.getTensor(inputData, precision, MemoryType.Device, false);
          CudaTensor deltaTensor = gpu.getTensor(delta, precision, MemoryType.Device, true);
          delta.freeRef();
          CudaTensorList feedback = getFeedback(gpu, inputTensor, deltaTensor);
          deltaTensor.freeRef();
          inputTensor.freeRef();
          return feedback;
        }, delta);
        inObj[0].accumulate(buffer, passbackTensorList);
      } else {
        // Nothing upstream wants the gradient; just release the delta.
        delta.freeRef();
      }
    }) {

      @Override
      public final void accumulate(DeltaSet<UUID> buffer, TensorList delta) {
        getAccumulator().accept(buffer, delta);
      }

      @Override
      protected void _free() {
        inputData.freeRef();
        Arrays.stream(inObj).forEach(nnResult -> nnResult.freeRef());
      }

      @Override
      public boolean isAlive() {
        return Arrays.stream(inObj).anyMatch(x -> x.isAlive());
      }
    };
  }

  /**
   * Computes the gradient with respect to the input from the gradient of the Gramian output.
   *
   * <p>For each input band {@code b}, the input is multiplied by two strided views of the delta
   * (one starting at element {@code b * bands} with the delta's channel stride, one starting at
   * element {@code b} with {@code bands} times that stride) and the two products are accumulated
   * in a scratch buffer. The buffer is then reduce-added into band {@code b} of the result,
   * scaled by {@code alpha / pixels}.
   *
   * @param gpu         the cuDNN handle used for allocation and tensor operations
   * @param inputTensor the forward input tensor
   * @param deltaTensor the gradient of the layer output
   * @return the gradient with the same dimensions as the input
   */
  @Nonnull
  public CudaTensorList getFeedback(final CudnnHandle gpu, final CudaTensor inputTensor,
      final CudaTensor deltaTensor) {
    int pixels = inputTensor.descriptor.height * inputTensor.descriptor.width;
    CudaMemory inputMemory = inputTensor.getMemory(gpu);
    CudaMemory deltaMemory = deltaTensor.getMemory(gpu);
    @Nonnull final int[] inputDimensions = {inputTensor.descriptor.width, inputTensor.descriptor.height,
        inputTensor.descriptor.channels};
    final int length = inputTensor.descriptor.batchCount;
    final int bands = inputDimensions[2];

    // Scratch buffer holding the per-band product of the input with the delta views.
    @Nullable final CudaMemory bufferMemory = gpu.allocate(
        (long) inputTensor.descriptor.nStride * length * precision.size, MemoryType.Device, true);
    @Nonnull final CudaDevice.CudaTensorDescriptor bufferDescriptor = gpu.newTensorDescriptor(precision, length,
        bands, inputDimensions[1], inputDimensions[0],
        inputDimensions[0] * inputDimensions[1] * bands,
        inputDimensions[0] * inputDimensions[1],
        inputDimensions[0],
        1);
    @Nonnull final CudaDevice.CudaTensorDescriptor outputDescriptor = gpu.newTensorDescriptor(precision, length,
        bands, inputDimensions[1], inputDimensions[0],
        inputDimensions[0] * inputDimensions[1] * bands,
        inputDimensions[0] * inputDimensions[1],
        inputDimensions[0],
        1);
    @Nullable final CudaMemory outputMemory = gpu.allocate(
        (long) outputDescriptor.nStride * precision.size * length, MemoryType.Managed, true);
    @Nonnull final CudaMemory workspacePtr = gpu.allocate(Math.max(outputMemory.size, inputMemory.size),
        MemoryType.Device, true);
    @Nonnull final CudaMemory indexPtr = gpu.allocate(12 * length, MemoryType.Device, false);
    @Nonnull final CudaResource<cudnnOpTensorDescriptor> multiplyDescriptor = gpu
        .newOpDescriptor(cudnnOpTensorOp.CUDNN_OP_TENSOR_MUL, precision);
    CudaResource<cudnnReduceTensorDescriptor> reduceAddDescriptor = gpu.cudnnCreateReduceTensorDescriptor(
        cudnnReduceTensorOp.CUDNN_REDUCE_TENSOR_ADD, precision.code,
        cudnnNanPropagation.CUDNN_NOT_PROPAGATE_NAN,
        cudnnReduceTensorIndices.CUDNN_REDUCE_TENSOR_NO_INDICES,
        cudnnIndicesType.CUDNN_32BIT_INDICES);

    // Describes one band of the result; the reduction target for each iteration.
    @Nonnull final CudaDevice.CudaTensorDescriptor bandDescriptor = gpu.newTensorDescriptor(precision, length, 1,
        inputDimensions[1], inputDimensions[0],
        inputDimensions[2] * inputDimensions[1] * inputDimensions[0],
        inputDimensions[1] * inputDimensions[0],
        inputDimensions[0],
        1);
    // Two views over the delta: consecutive elements, and elements strided by `bands`.
    @Nonnull final CudaDevice.CudaTensorDescriptor viewDescriptor1 = gpu.newTensorDescriptor(precision, length,
        bands, 1, 1,
        deltaTensor.descriptor.nStride,
        deltaTensor.descriptor.cStride,
        deltaTensor.descriptor.hStride,
        deltaTensor.descriptor.wStride);
    @Nonnull final CudaDevice.CudaTensorDescriptor viewDescriptor2 = gpu.newTensorDescriptor(precision, length,
        bands, 1, 1,
        deltaTensor.descriptor.nStride,
        deltaTensor.descriptor.cStride * bands,
        deltaTensor.descriptor.hStride,
        deltaTensor.descriptor.wStride
    );

    IntStream.range(0, bands).forEach(band -> {
      // buffer = input * deltaView1 (beta = 0 overwrites the buffer)
      CudaMemory deltaView1 = deltaMemory.withByteOffset(band * precision.size * bands);
      CudaSystem.handle(gpu.cudnnOpTensor(multiplyDescriptor.getPtr(),
          precision.getPointer(1.0), inputTensor.descriptor.getPtr(), inputMemory.getPtr(),
          precision.getPointer(1.0), viewDescriptor1.getPtr(), deltaView1.getPtr(),
          precision.getPointer(0.0), bufferDescriptor.getPtr(), bufferMemory.getPtr()));
      inputMemory.dirty();
      deltaView1.dirty();
      bufferMemory.dirty();
      deltaView1.freeRef();

      // buffer += input * deltaView2 (beta = 1 accumulates into the buffer)
      CudaMemory deltaView2 = deltaMemory.withByteOffset(band * precision.size);
      CudaSystem.handle(gpu.cudnnOpTensor(multiplyDescriptor.getPtr(),
          precision.getPointer(1.0), inputTensor.descriptor.getPtr(), inputMemory.getPtr(),
          precision.getPointer(1.0), viewDescriptor2.getPtr(), deltaView2.getPtr(),
          precision.getPointer(1.0), bufferDescriptor.getPtr(), bufferMemory.getPtr()));
      inputMemory.dirty();
      deltaView2.dirty();
      bufferMemory.dirty();
      deltaView2.freeRef();

      // Reduce the buffer into this band of the result. The status is checked so that a failed
      // reduction surfaces as an error instead of silently yielding an unwritten gradient,
      // matching the handling in getOutput.
      CudaMemory outputViewMem = outputMemory.withByteOffset(bandDescriptor.cStride * band * precision.size);
      CudaSystem.handle(gpu.cudnnReduceTensor(reduceAddDescriptor.getPtr(),
          indexPtr.getPtr(), indexPtr.size, workspacePtr.getPtr(), workspacePtr.size,
          precision.getPointer(alpha / pixels), bufferDescriptor.getPtr(), bufferMemory.getPtr(),
          precision.getPointer(0.0), bandDescriptor.getPtr(), outputViewMem.getPtr()));
      outputViewMem.dirty();
      bufferMemory.dirty();
      outputViewMem.freeRef();
    });

    // outputMemory and outputDescriptor are handed over to the wrapped tensor, so they are not
    // released here.
    CudaTensorList feedback = CudaTensorList.wrap(CudaTensor.wrap(outputMemory, outputDescriptor, precision),
        length, inputDimensions, precision);
    bandDescriptor.freeRef();
    viewDescriptor1.freeRef();
    viewDescriptor2.freeRef();
    workspacePtr.freeRef();
    indexPtr.freeRef();
    reduceAddDescriptor.freeRef();
    inputMemory.freeRef();
    multiplyDescriptor.freeRef();
    deltaMemory.freeRef();
    bufferMemory.freeRef();
    bufferDescriptor.freeRef();
    return feedback;
  }

  /**
   * Computes the Gramian output for an input tensor.
   *
   * <p>For each input band {@code b}, the whole input is multiplied element-wise by a single-band
   * view of band {@code b}, and the product is reduce-added into {@code bands} consecutive
   * output elements starting at element {@code b * bands}, scaled by {@code alpha / pixels}.
   *
   * @param gpu         the cuDNN handle used for allocation and tensor operations
   * @param inputTensor the input tensor
   * @return the output with dimensions {@code [1, 1, bands * bands]} per batch item
   */
  @Nonnull
  public CudaTensorList getOutput(final CudnnHandle gpu, final CudaTensor inputTensor) {
    int pixels = inputTensor.descriptor.height * inputTensor.descriptor.width;
    @Nonnull final int[] inputDimensions = {inputTensor.descriptor.width, inputTensor.descriptor.height,
        inputTensor.descriptor.channels};
    final int length = inputTensor.descriptor.batchCount;
    final int bands = inputDimensions[2];
    @Nonnull final int[] outputDimensions = {1, 1, bands * bands};

    CudaMemory inputMemory = inputTensor.getMemory(gpu);

    @Nonnull final CudaDevice.CudaTensorDescriptor ouputDescriptor = gpu.newTensorDescriptor(precision, length,
        bands * bands, 1, 1,
        bands * bands,
        1,
        1,
        1);
    @Nullable final CudaMemory outputMemory = gpu.allocate(
        (long) ouputDescriptor.nStride * precision.size * length, MemoryType.Device, true);

    // Scratch buffer holding the product of the input with one of its bands.
    @Nonnull final CudaDevice.CudaTensorDescriptor bufferDescriptor = gpu.newTensorDescriptor(precision, length,
        bands, inputDimensions[1], inputDimensions[0],
        inputDimensions[0] * inputDimensions[1] * bands,
        inputDimensions[0] * inputDimensions[1],
        inputDimensions[0],
        1);
    @Nullable final CudaMemory bufferMemory = gpu.allocate(
        (long) bufferDescriptor.nStride * length * precision.size, MemoryType.Device, true);

    // Single-band view over the input, using the input's own strides.
    @Nonnull final CudaDevice.CudaTensorDescriptor inputViewDescriptor = gpu.newTensorDescriptor(precision,
        length, 1, inputDimensions[1], inputDimensions[0],
        inputTensor.descriptor.nStride,
        inputTensor.descriptor.cStride,
        inputTensor.descriptor.hStride,
        inputTensor.descriptor.wStride);

    CudaResource<cudnnReduceTensorDescriptor> reduceAddDescriptor = gpu.cudnnCreateReduceTensorDescriptor(
        cudnnReduceTensorOp.CUDNN_REDUCE_TENSOR_ADD, precision.code,
        cudnnNanPropagation.CUDNN_NOT_PROPAGATE_NAN,
        cudnnReduceTensorIndices.CUDNN_REDUCE_TENSOR_NO_INDICES,
        cudnnIndicesType.CUDNN_32BIT_INDICES);

    // View over `bands` consecutive output elements; the reduction target for each iteration.
    @Nonnull final CudaDevice.CudaTensorDescriptor outputViewDescriptor = gpu.newTensorDescriptor(precision,
        length, bands, 1, 1, bands * bands, 1, 1, 1);
    @Nonnull final CudaResource<cudnnOpTensorDescriptor> multiplyDescriptor = gpu
        .newOpDescriptor(cudnnOpTensorOp.CUDNN_OP_TENSOR_MUL, precision);

    @Nonnull final CudaMemory workspacePtr = gpu.allocate(Math.max(outputMemory.size, inputMemory.size),
        MemoryType.Device, true);
    @Nonnull final CudaMemory indexPtr = gpu.allocate((long) 12 * length, MemoryType.Device, true);

    IntStream.range(0, inputDimensions[2]).forEach(band -> {
      // buffer = input * (band `band` of the input)
      CudaMemory inputView = inputMemory
          .withByteOffset(band * precision.size * inputTensor.descriptor.cStride);
      CudaSystem.handle(gpu.cudnnOpTensor(multiplyDescriptor.getPtr(),
          precision.getPointer(1.0), inputTensor.descriptor.getPtr(), inputMemory.getPtr(),
          precision.getPointer(1.0), inputViewDescriptor.getPtr(), inputView.getPtr(),
          precision.getPointer(0.0), bufferDescriptor.getPtr(), bufferMemory.getPtr()));
      bufferMemory.dirty();
      inputView.dirty();
      inputMemory.dirty();
      inputView.freeRef();

      // Reduce the buffer into this band's slice of the output, scaled by alpha / pixels.
      CudaMemory outputView = outputMemory.withByteOffset(band * precision.size * bands);
      CudaSystem.handle(gpu.cudnnReduceTensor(reduceAddDescriptor.getPtr(),
          indexPtr.getPtr(), indexPtr.size, workspacePtr.getPtr(), workspacePtr.size,
          precision.getPointer(alpha / pixels), bufferDescriptor.getPtr(), bufferMemory.getPtr(),
          precision.getPointer(0.0), outputViewDescriptor.getPtr(), outputView.getPtr()));
      outputView.dirty();
      bufferMemory.dirty();
      outputView.freeRef();
    });

    outputMemory.dirty();
    bufferMemory.dirty();
    inputMemory.dirty();

    bufferMemory.freeRef();
    multiplyDescriptor.freeRef();
    inputMemory.freeRef();
    bufferDescriptor.freeRef();
    inputViewDescriptor.freeRef();
    outputViewDescriptor.freeRef();
    reduceAddDescriptor.freeRef();
    workspacePtr.freeRef();
    indexPtr.freeRef();

    // outputMemory and ouputDescriptor are handed over to the wrapped tensor.
    return CudaTensorList.wrap(CudaTensor.wrap(outputMemory, ouputDescriptor, precision), length,
        outputDimensions, precision);
  }

  /**
   * Serializes this layer, writing the {@code "precision"} and {@code "alpha"} properties onto
   * the JSON stub from the superclass.
   *
   * @param resources      the resource map (not written by this method)
   * @param dataSerializer the data serializer (not used by this method)
   * @return the JSON form of this layer
   */
  @Nonnull
  @Override
  public JsonObject getJson(Map<CharSequence, byte[]> resources, @Nonnull DataSerializer dataSerializer) {
    @Nonnull final JsonObject json = super.getJsonStub();
    json.addProperty("precision", precision.name());
    json.addProperty("alpha", alpha);
    return json;
  }

  /**
   * Returns the layer state, which is always empty for this layer.
   *
   * @return an empty list
   */
  @Nonnull
  @Override
  public List<double[]> state() {
    return Arrays.asList();
  }

  @Override
  public Precision getPrecision() {
    return precision;
  }

  @Nonnull
  @Override
  public GramianLayer setPrecision(final Precision precision) {
    this.precision = precision;
    return this;
  }

  /**
   * Gets alpha, the factor that is divided by the pixel count to scale the forward output and
   * the gradient.
   *
   * @return the alpha
   */
  public double getAlpha() {
    return alpha;
  }

  /**
   * Sets alpha, the factor that is divided by the pixel count to scale the forward output and
   * the gradient.
   *
   * @param alpha the alpha
   * @return this layer
   */
  public GramianLayer setAlpha(final double alpha) {
    this.alpha = alpha;
    return this;
  }
}