Java tutorial: launching CUDA kernels with JCuda's KernelLauncher
/* * JCudaUtils - Utilities for JCuda * http://www.jcuda.org * * Copyright (c) 2010 Marco Hutter - http://www.jcuda.org * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ package jcuda.utils; import com.google.common.io.ByteStreams; import jcuda.CudaException; import jcuda.Pointer; import jcuda.driver.*; import jcuda.runtime.JCuda; import jcuda.runtime.dim3; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.nd4j.linalg.jcublas.context.ContextHolder; import org.nd4j.linalg.jcublas.kernel.KernelFunctionLoader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.naming.Context; import java.io.*; import static jcuda.driver.JCudaDriver.*; /** * This is a utility class that simplifies the setup and launching * of CUDA kernels using the JCuda Driver API. <br /> * <br /> * Instances of this class may be created using one of the following * methods: <br /> * <ul> * <li> * {@link KernelLauncher#compile(String, String, String...)} will * compile a kernel from a String containing the CUDA source code * </li> * <li> * {@link KernelLauncher#create(String, String, String...)} will * create a kernel for a function that is contained in a CUDA * source file * </li> * <li> * {@link KernelLauncher#load(String, String)} will load a kernel from * a PTX or CUBIN (CUDA binary) file. * </li> * <li> * {@link KernelLauncher#load(InputStream, String)} will load a kernel * from PTX- or CUBIN data which is provided via an InputStream * (useful for packaging PTX- or CUBIN files into JAR archives)<br /> * </li> * </ul> * * <br /> * These instances may then be used to call a kernel function with * the {@link KernelLauncher#call(Object...)} method. The actual * kernel function arguments which are passed to this method * will be set up automatically, and aligned appropriately for * their respective size.<br /> * <br /> * The setup of the execution may be performed similarly as the invocation * of a kernel when using the Runtime API in C. 
 * Such a call has the form<br />
 * <code>
 * kernel<<<gridDim, blockDim, sharedMemorySize, stream>>>(...);
 * </code>
 * <br />
 * where
 * <ul>
 *   <li>
 *     <b>gridDim</b> is a dim3 which specifies the number of blocks per grid
 *   </li>
 *   <li>
 *     <b>blockDim</b> is a dim3 that specifies the number of threads per block
 *   </li>
 *   <li>
 *     <b>sharedMemorySize</b> is the size of the shared memory for the kernel
 *   </li>
 *   <li>
 *     <b>stream</b> is a stream for asynchronous kernel execution
 *   </li>
 * </ul>
 * Similarly, the KernelLauncher allows specifying these parameters in the
 * {@link KernelLauncher#setup(dim3, dim3, int, CUstream)} method: <br />
 * <br />
 * <code>
 * kernelLauncher.setup(gridDim, blockDim, sharedMemorySize, stream).call(...);
 * </code>
 * <br />
 * <br />
 * When default values for some of the parameters should be used,
 * one of the overloaded versions of the setup method may be called:
 * <br />
 * <br />
 * <code>
 * kernelLauncher.setup(gridDim, blockDim).call(...);
 * </code>
 * <br />
 * <br />
 * The parameters may also be set individually:<br />
 * <br />
 * <code>
 * kernelLauncher.setGridSize(gridSize);<br />
 * kernelLauncher.setBlockSize(blockSize);<br />
 * kernelLauncher.call(...);
 * </code>
 * <br />
 * A complete usage sketch is shown after the class listing below.
 */
public class KernelLauncher {

    public final static String FUNCTION_NAME = "transform";

    /**
     * The logger used in this class
     */
    private static final Logger logger = LoggerFactory.getLogger(KernelLauncher.class.getName());

    /**
     * The path prefix, containing the path to the NVCC compiler.
     * Not required if the path to the NVCC is present in an
     * environment variable.
     */
    private static String compilerPath = "";

    /**
     * The number of the device which should be used by the
     * KernelLauncher
     */
    private int deviceNumber = 0;

    /**
     * Set the path to the NVCC compiler. For example: <br />
     * <code>setCompilerPath("C:/CUDA/bin");</code>
     * <br />
     * By default, this path is empty, assuming that the compiler
     * is in a path that is visible via an environment variable.
     *
     * @param path The path to the NVCC compiler.
     */
    public static void setCompilerPath(String path) {
        if (path == null) {
            // A null path resets the prefix, so that nvcc is resolved via the environment
            compilerPath = "";
            return;
        }
        compilerPath = path;
        if (!compilerPath.endsWith(File.separator)) {
            compilerPath += File.separator;
        }
    }

    /**
     * Set the number (index) of the device which should be used
     * by the KernelLauncher
     *
     * @param number The number of the device to use
     * @throws CudaException If number < 0 or number >= deviceCount
     */
    public void setDeviceNumber(int number) {
        int count[] = new int[1];
        cuDeviceGetCount(count);
        if (number < 0 || number >= count[0]) {
            throw new CudaException("Invalid device number: " + number + ". "
                + "There are only " + count[0] + " devices available");
        }
        deviceNumber = number;
    }

    /**
     * Create a new KernelLauncher for the function with the given
     * name, that is defined in the given source code. <br />
     * <br />
     * The source code is stored in a temporary .CU CUDA source file,
     * and a PTX file is compiled from this source file using the
     * NVCC (NVIDIA CUDA C Compiler) in a separate process.
     * The optional nvccArguments are passed to the NVCC.<br />
     * <br />
     * The NVCC has to be in a visible directory. E.g. for Windows, the
     * NVCC.EXE has to be in a directory that is contained in the PATH
     * environment variable. Alternatively, the path to the NVCC may
     * be specified by calling {@link KernelLauncher#setCompilerPath(String)}
     * with the respective path.
<br /> * <br /> * <u><b>Note</b></u>: In order to make the function accessible * by the name it has in the source code, the function has to * be declared as an <code><u>extern "C"</u></code> function: <br /> * <br /> * <code> * <b>extern "C"</b><br /> * __global__ void functionName(...)<br /> * {<br /> * ... <br /> * }<br /> * </code> * * * @see KernelLauncher#create(String, String, String...) * @see KernelLauncher#create(String, String, boolean, String...) * * @param sourceCode The source code containing the function * @param functionName The name of the function. * @param nvccArguments Optional arguments for the NVCC * @return The KernelLauncher for the specified function * @throws CudaException If the creation of the CU- or PTX file * fails, or the PTX may not be loaded, or the specified * function can not be obtained. */ public static KernelLauncher compile(String sourceCode, String functionName, String... nvccArguments) { File cuFile = null; try { cuFile = File.createTempFile("temp_JCuda_", ".cu"); } catch (IOException e) { throw new CudaException("Could not create temporary .cu file", e); } String cuFileName = cuFile.getPath(); FileOutputStream fos = null; try { fos = new FileOutputStream(cuFile); fos.write(sourceCode.getBytes()); } catch (IOException e) { throw new CudaException("Could not write temporary .cu file", e); } finally { if (fos != null) { try { fos.close(); } catch (IOException e) { throw new CudaException("Could not close temporary .cu file", e); } } } return create(cuFileName, functionName, nvccArguments); } public void setModule(CUmodule module) { this.module = module; } /** * Create a new KernelLauncher for the function with the given * name, that is contained in the .CU CUDA source file with the * given name. <br /> * <br /> * <u><b>Note</b></u>: In order to make the function accessible * by the name it has in the source code, the function has to * be declared as an <code><u>extern "C"</u></code> function: <br /> * <br /> * <code> * <b>extern "C"</b><br /> * __global__ void functionName(...)<br /> * {<br /> * ... <br /> * }<br /> * </code> * <br /> * The extension of the given file name is replaced with "ptx". * If the PTX file with the resulting name does not exist, * or is older than the .CU file, it is compiled from * the specified source file using the NVCC (NVIDIA CUDA C * Compiler) in a separate process. The optional nvccArguments * are passed to the NVCC.<br /> * <br /> * The NVCC has to be in a visible directory. E.g. for Windows, the * NVCC.EXE has to be in a directory that is contained in the PATH * environment variable. Alternatively, the path to the NVCC may * be specified by calling {@link KernelLauncher#setCompilerPath(String)} * with the respective path. <br /> * * @see KernelLauncher#compile(String, String, String...) * @see KernelLauncher#create(String, String, boolean, String...) * @see KernelLauncher#load(InputStream, String) * * @param cuFileName The name of the source file. * @param functionName The name of the function. * @param nvccArguments Optional arguments for the NVCC * @return The KernelLauncher for the specified function * @throws CudaException If the creation of the PTX file fails, * or the PTX may not be loaded, or the specified function can * not be obtained. */ public static KernelLauncher create(String cuFileName, String functionName, String... 
nvccArguments) { return create(cuFileName, functionName, false, nvccArguments); } /** * Create a new KernelLauncher for the function with the given * name, that is contained in the .CU CUDA source file with the * given name. <br /> * <br /> * <u><b>Note</b></u>: In order to make the function accessible * by the name it has in the source code, the function has to * be declared as an <code><u>extern "C"</u></code> function: <br /> * <br /> * <code> * <b>extern "C"</b><br /> * __global__ void functionName(...)<br /> * {<br /> * ... <br /> * }<br /> * </code> * <br /> * The extension of the given file name is replaced with "ptx". * If the PTX file with the resulting name does not exist, * or is older than the .CU file, it is compiled from * the specified source file using the NVCC (NVIDIA CUDA C * Compiler) in a separate process. The optional nvccArguments * are passed to the NVCC.<br /> * <br /> * If the <code>forceRebuild</code> flag is 'true', then the * PTX file will be recompiled from the given source file, * even if it already existed or was newer than the source * file, and the already existing PTX file will be * overwritten.<br /> * <br /> * The NVCC has to be in a visible directory. E.g. for Windows, the * NVCC.EXE has to be in a directory that is contained in the PATH * environment variable. Alternatively, the path to the NVCC may * be specified by calling {@link KernelLauncher#setCompilerPath(String)} * with the respective path. <br /> * * @see KernelLauncher#compile(String, String, String...) * @see KernelLauncher#create(String, String, String...) * @see KernelLauncher#load(InputStream, String) * * @param cuFileName The name of the source file. * @param functionName The name of the function. * @param forceRebuild Whether the PTX file should be recompiled * and overwritten if it already exists. * @param nvccArguments Optional arguments for the NVCC * @return The KernelLauncher for the specified function * @throws CudaException If the creation of the PTX file fails, * or the PTX may not be loaded, or the specified function can * not be obtained. */ public static KernelLauncher create(String cuFileName, String functionName, boolean forceRebuild, String... nvccArguments) { // Prepare the PTX file for the CU source file String ptxFileName = null; try { ptxFileName = preparePtxFile(cuFileName, forceRebuild, nvccArguments); } catch (IOException e) { throw new CudaException("Could not prepare PTX for source file '" + cuFileName + "'", e); } KernelLauncher kernelLauncher = new KernelLauncher(); byte ptxData[] = loadData(ptxFileName); kernelLauncher.initModule(ptxData); kernelLauncher.initFunction(functionName); return kernelLauncher; } /** * Create a new KernelLauncher which may be used to execute the * specified function which is loaded from the PTX- or CUBIN * (CUDA binary) file with the given name. * * @see KernelLauncher#compile(String, String, String...) * @see KernelLauncher#create(String, String, boolean, String...) * @see KernelLauncher#load(InputStream, String) * * @param functionName The name of the function * @return The KernelLauncher for the specified function * @throws CudaException If the PTX- or CUBIN may not be loaded, * or the specified function can not be obtained. 
 */
    public static KernelLauncher load(String functionName, CUmodule module) {
        ContextHolder.getInstance().setContext();
        KernelLauncher kernelLauncher = new KernelLauncher();
        kernelLauncher.setModule(module);
        kernelLauncher.initFunction(functionName);
        return kernelLauncher;
    }

    /**
     * Create a new KernelLauncher which may be used to execute a
     * function that is contained in the given, already loaded module.
     * The name of the function is composed of
     * {@link KernelLauncher#FUNCTION_NAME}, an underscore, and the given type.
     *
     * @see KernelLauncher#compile(String, String, String...)
     * @see KernelLauncher#create(String, String, boolean, String...)
     * @see KernelLauncher#load(InputStream, String)
     *
     * @param functionName The name of the function (ignored; the function name
     *        is derived from {@code FUNCTION_NAME} and the given type)
     * @param type The type suffix of the function
     * @param module The module that contains the function
     * @return The KernelLauncher for the specified function
     * @throws CudaException If the specified function can not be obtained.
     */
    public static KernelLauncher load(String functionName, String type, CUmodule module) {
        KernelLauncher kernelLauncher = new KernelLauncher();
        kernelLauncher.setModule(module);
        kernelLauncher.initFunction(FUNCTION_NAME + "_" + type);
        return kernelLauncher;
    }

    /**
     * Create a new KernelLauncher which may be used to execute a
     * function which is loaded from the PTX- or CUBIN (CUDA binary)
     * file with the given name. The name of the function is composed of
     * {@link KernelLauncher#FUNCTION_NAME}, an underscore, and the given type.
     *
     * @see KernelLauncher#compile(String, String, String...)
     * @see KernelLauncher#create(String, String, boolean, String...)
     * @see KernelLauncher#load(InputStream, String)
     *
     * @param moduleFileName The name of the PTX- or CUBIN file
     * @param functionName The name of the function (ignored; the function name
     *        is derived from {@code FUNCTION_NAME} and the given type)
     * @param type The type suffix of the function
     * @return The KernelLauncher for the specified function
     * @throws CudaException If the PTX- or CUBIN may not be loaded,
     * or the specified function can not be obtained.
     */
    public static KernelLauncher load(String moduleFileName, String functionName, String type) {
        KernelLauncher kernelLauncher = new KernelLauncher();
        // Report failures as a CudaException instead of silently
        // continuing with an uninitialized module
        try (InputStream moduleStream = new FileInputStream(moduleFileName)) {
            kernelLauncher.initModule(ByteStreams.toByteArray(moduleStream));
        } catch (IOException e) {
            throw new CudaException("Could not load module file '" + moduleFileName + "'", e);
        }
        kernelLauncher.initFunction(FUNCTION_NAME + "_" + type);
        return kernelLauncher;
    }

    /**
     * Create a new KernelLauncher which may be used to execute the
     * specified function which is loaded from the PTX- or CUBIN
     * data that is read from the given input stream.
     *
     * @see KernelLauncher#compile(String, String, String...)
     * @see KernelLauncher#create(String, String, boolean, String...)
     *
     * @param moduleInputStream The stream for the PTX- or CUBIN data
     * @param functionName The name of the function
     * @return The KernelLauncher for the specified function
     * @throws CudaException If the PTX- or CUBIN may not be loaded,
     * or the specified function can not be obtained.
*/ public static KernelLauncher load(InputStream moduleInputStream, String functionName) { KernelLauncher kernelLauncher = new KernelLauncher(); byte moduleData[] = loadData(moduleInputStream); kernelLauncher.initModule(moduleData); kernelLauncher.initFunction(functionName); return kernelLauncher; } /** * Load the data from the file with the given name and returns * it as a 0-terminated byte array * * @param fileName The name of the file * @return The data from the file */ private static byte[] loadData(String fileName) { InputStream inputStream = null; try { inputStream = new FileInputStream(new File(fileName)); return loadData(inputStream); } catch (FileNotFoundException e) { throw new CudaException("Could not open '" + fileName + "'", e); } finally { if (inputStream != null) { try { inputStream.close(); } catch (IOException e) { throw new CudaException("Could not close '" + fileName + "'", e); } } } } /** * Reads the data from the given inputStream and returns it as * a 0-terminated byte array. * * @param inputStream The inputStream to read * @return The data from the inputStream */ private static byte[] loadData(InputStream inputStream) { ByteArrayOutputStream baos = null; try { baos = new ByteArrayOutputStream(); byte buffer[] = new byte[8192]; while (true) { int read = inputStream.read(buffer); if (read == -1) { break; } baos.write(buffer, 0, read); } baos.write('\0'); baos.flush(); return baos.toByteArray(); } catch (IOException e) { throw new CudaException("Could not load data", e); } finally { if (baos != null) { try { baos.close(); } catch (IOException e) { throw new CudaException("Could not close output", e); } } } } /** * The context which was used to create this instance */ private CUcontext context; /** * The module which contains the function */ private CUmodule module; /** * The function which is executed with this KernelLauncher */ private CUfunction function; /** * The current block size (number of threads per block) * which will be used for the function call. */ private dim3 blockSize = new dim3(1, 1, 1); /** * The current grid size (number of blocks per grid) * which will be used for the function call. */ private dim3 gridSize = new dim3(1, 1, 1); /** * The currently specified size of the shared memory * for the function call. */ private int sharedMemSize = 0; /** * The stream that should be associated with the function call. */ private CUstream stream; /** * Private constructor. Instantiation only via the static * methods. */ private KernelLauncher() { initialize(); } /** * Initializes this KernelLauncher. This method will try to * initialize the JCuda driver API. Then it will try to * attach to the current CUDA context. If no active CUDA * context exists, then it will try to create one, for * the device which is specified by the current * deviceNumber. * * @throws CudaException If it is neither possible to * attach to an existing context, nor to create a new * context. */ private void initialize() { context = ContextHolder.getInstance().getContext(deviceNumber, Thread.currentThread().getName()); } /** * Create a new KernelLauncher which uses the same module as * this KernelLauncher, but may be used to execute a different * function. All parameters (grid size, block size, shared * memory size and stream) of the returned KernelLauncher * will be independent of 'this' one and initially contain * the default values. 
 *
     * @param functionName The name of the function
     * @return The KernelLauncher for the specified function
     * @throws CudaException If the specified function can not
     * be obtained from the module of this KernelLauncher.
     */
    public KernelLauncher forFunction(String functionName) {
        KernelLauncher kernelLauncher = new KernelLauncher();
        kernelLauncher.module = this.module;
        kernelLauncher.initFunction(functionName);
        return kernelLauncher;
    }

    /**
     * Initialize the module for this KernelLauncher by loading
     * the PTX- or CUBIN file with the given name.
     *
     * @param path The path of the PTX- or CUBIN file
     */
    private void loadModuleData(String path) {
        module = new CUmodule();
        // Close the stream and report failures instead of swallowing them
        try (InputStream fis = new FileInputStream(path)) {
            JCudaDriver.cuModuleLoadData(module, ByteStreams.toByteArray(fis));
        } catch (IOException e) {
            throw new CudaException("Could not load module file '" + path + "'", e);
        }
    }

    /**
     * Initialize the module for this KernelLauncher from the given
     * PTX- or CUBIN data.
     *
     * @param moduleData The data from the PTX- or CUBIN file
     */
    private void initModule(Pointer moduleData) {
        module = new CUmodule();
        cuModuleLoadDataEx(module, moduleData, 0, new int[0], Pointer.to(new int[0]));
    }

    /**
     * Initialize the module for this KernelLauncher by loading
     * the PTX- or CUBIN file with the given name.
     *
     * @param fileName The name of the PTX- or CUBIN file
     */
    private void initModule(String fileName) {
        module = new CUmodule();
        cuModuleLoad(module, fileName);
    }

    /**
     * Initialize the module for this KernelLauncher from the given
     * PTX- or CUBIN data.
     *
     * @param moduleData The data from the PTX- or CUBIN file
     */
    private void initModule(byte moduleData[]) {
        module = new CUmodule();
        cuModuleLoadData(module, moduleData);
    }

    /**
     * Initialize this KernelLauncher for calling the function with
     * the given name, which is contained in the module of this
     * KernelLauncher
     *
     * @param functionName The name of the function
     */
    private void initFunction(String functionName) {
        // Obtain the function from the module
        function = new CUfunction();
        String functionErrorString =
            "Could not get function '" + functionName + "' from the module\n"
            + "Name in module might be mangled. Try adding the line\n"
            + "extern \"C\"\n"
            + "before the function you want to call, or open the PTX/CUBIN\n"
            + "file with a text editor to find out the mangled function name";
        try {
            int result = cuModuleGetFunction(function, module, functionName);
            if (result != CUresult.CUDA_SUCCESS)
                throw new CudaException(functionErrorString);
        } catch (CudaException e) {
            throw new CudaException(functionErrorString, e);
        }
    }

    /**
     * Returns the module that was created from the PTX- or CUBIN file, and
     * which contains the function that should be executed. This
     * module may also be used to access symbols and texture
     * references. However, clients should not modify or unload
     * the module.
     *
     * @return The CUmodule
     */
    public CUmodule getModule() {
        return module;
    }

    /**
     * Set the grid size (number of blocks per grid) for the function
     * call.<br />
     * <br />
     * This corresponds to the first parameter in the runtime call:<br />
     * <br />
     * <code>
     * kernel<<<<b><u>gridSize</u></b>, blockSize, sharedMemSize, stream>>>(...);
     * </code>
     * <br />
     * <br />
     * The default grid size is (1,1,1)
     *
     * @see KernelLauncher#call(Object...)
* @see KernelLauncher#setup(dim3, dim3) * @see KernelLauncher#setup(dim3, dim3, int) * @see KernelLauncher#setup(dim3, dim3, int, CUstream) * * @param x The number of blocks per grid in x-direction * @param y The number of blocks per grid in y-direction * @return This instance */ public KernelLauncher setGridSize(int x, int y) { gridSize.x = x; gridSize.y = y; return this; } /** * Set the grid size (number of blocks per grid) for the function * call.<br /> * <br /> * This corresponds to the first parameter in the runtime call:<br /> * <br /> * <code> * kernel<<<<b><u>gridSize</u></b>, blockSize, * sharedMemSize, stream>>>(...); * </code> * <br /> * <br /> * The default grid size is (1,1,1) * * @see KernelLauncher#call(Object...) * @see KernelLauncher#setup(dim3, dim3) * @see KernelLauncher#setup(dim3, dim3, int) * @see KernelLauncher#setup(dim3, dim3, int, CUstream) * * @param x The number of blocks per grid in x-direction * @param y The number of blocks per grid in y-direction * @param z The number of blocks per grid in z-direction * @return This instance */ public KernelLauncher setGridSize(int x, int y, int z) { gridSize.x = x; gridSize.y = y; gridSize.z = z; return this; } /** * Set the block size (number of threads per block) for the function * call.<br /> * <br /> * This corresponds to the second parameter in the runtime call:<br /> * <br /> * <code> * kernel<<<gridSize, <b><u>blockSize</u></b>, * sharedMemSize, stream>>>(...); * </code> * <br /> * <br /> * The default block size is (1,1,1) * * @see KernelLauncher#call(Object...) * @see KernelLauncher#setup(dim3, dim3) * @see KernelLauncher#setup(dim3, dim3, int) * @see KernelLauncher#setup(dim3, dim3, int, CUstream) * * @param x The number of threads per block in x-direction * @param y The number of threads per block in y-direction * @param z The number of threads per block in z-direction * @return This instance */ public KernelLauncher setBlockSize(int x, int y, int z) { blockSize.x = x; blockSize.y = y; blockSize.z = z; return this; } /** * Set the size of the shared memory for the function * call.<br /> * <br /> * This corresponds to the third parameter in the runtime call:<br /> * <br /> * <code> * kernel<<<gridSize, blockSize, * <b><u>sharedMemSize</u></b>, stream>>>(...); * </code> * <br /> * <br /> * The default shared memory size is 0. * * @see KernelLauncher#call(Object...) * @see KernelLauncher#setup(dim3, dim3) * @see KernelLauncher#setup(dim3, dim3, int) * @see KernelLauncher#setup(dim3, dim3, int, CUstream) * * @param sharedMemSize The size of the shared memory, in bytes * @return This instance */ public KernelLauncher setSharedMemSize(int sharedMemSize) { this.sharedMemSize = sharedMemSize; return this; } /** * Set the stream for the function call.<br /> * <br /> * This corresponds to the fourth parameter in the runtime call:<br /> * <br /> * <code> * kernel<<<gridSize, blockSize, * sharedMemSize, <b><u>stream</u></b>>>>(...); * </code> * <br /> * <br /> * The default stream is null (0). * * @see KernelLauncher#call(Object...) * @see KernelLauncher#setup(dim3, dim3) * @see KernelLauncher#setup(dim3, dim3, int) * @see KernelLauncher#setup(dim3, dim3, int, CUstream) * * @param stream The stream for the function call * @return This instance */ public KernelLauncher setStream(CUstream stream) { this.stream = stream; return this; } /** * Set the given grid size and block size for this KernelLauncher. * * @see KernelLauncher#call(Object...) 
 * @see KernelLauncher#setup(dim3, dim3, int)
     * @see KernelLauncher#setup(dim3, dim3, int, CUstream)
     *
     * @param gridSize The grid size (number of blocks per grid)
     * @param blockSize The block size (number of threads per block)
     * @return This instance
     */
    public KernelLauncher setup(dim3 gridSize, dim3 blockSize) {
        return setup(gridSize, blockSize, sharedMemSize, stream);
    }

    /**
     * Set the given grid size and block size and shared memory size
     * for this KernelLauncher.
     *
     * @see KernelLauncher#call(Object...)
     * @see KernelLauncher#setup(dim3, dim3)
     * @see KernelLauncher#setup(dim3, dim3, int, CUstream)
     *
     * @param gridSize The grid size (number of blocks per grid)
     * @param blockSize The block size (number of threads per block)
     * @param sharedMemSize The size of the shared memory
     * @return This instance
     */
    public KernelLauncher setup(dim3 gridSize, dim3 blockSize, int sharedMemSize) {
        return setup(gridSize, blockSize, sharedMemSize, stream);
    }

    public CUcontext context() {
        return context;
    }

    /**
     * Set the given grid size and block size, shared memory size
     * and stream for this KernelLauncher.
     *
     * @see KernelLauncher#call(Object...)
     * @see KernelLauncher#setup(dim3, dim3)
     * @see KernelLauncher#setup(dim3, dim3, int)
     *
     * @param gridSize The grid size (number of blocks per grid)
     * @param blockSize The block size (number of threads per block)
     * @param sharedMemSize The size of the shared memory
     * @param stream The stream for the kernel invocation
     * @return This instance
     */
    public KernelLauncher setup(dim3 gridSize, dim3 blockSize, int sharedMemSize, CUstream stream) {
        // Forward all three components of the grid size, so that a
        // non-trivial z-dimension is not silently dropped
        setGridSize(gridSize.x, gridSize.y, gridSize.z);
        setBlockSize(blockSize.x, blockSize.y, blockSize.z);
        setSharedMemSize(sharedMemSize);
        setStream(stream);
        return this;
    }

    /**
     * Call the function of this KernelLauncher with the current
     * grid size, block size, shared memory size and stream, and
     * with the given arguments.<br />
     * <br />
     * The given arguments must all be either of the type
     * <code>Pointer</code>, or of a primitive type except boolean.
     * Otherwise, a CudaException will be thrown.
     *
     * @param args The arguments for the function call
     * @throws CudaException if an argument with an invalid type
     * was given, or one of the internal functions for setting
     * up and executing the kernel failed.
     */
    public void call(Object...
args) { Pointer kernelParameters[] = new Pointer[args.length]; for (int i = 0; i < args.length; i++) { Object arg = args[i]; if (arg instanceof Pointer) { Pointer argPointer = (Pointer) arg; Pointer pointer = Pointer.to(argPointer); kernelParameters[i] = pointer; //logger.info("argument " + i + " type is Pointer"); } else if (arg instanceof Byte) { Byte value = (Byte) arg; Pointer pointer = Pointer.to(new byte[] { value }); kernelParameters[i] = pointer; //logger.info("argument " + i + " type is Byte"); } else if (arg instanceof Short) { Short value = (Short) arg; Pointer pointer = Pointer.to(new short[] { value }); kernelParameters[i] = pointer; // logger.info("argument " + i + " type is Short"); } else if (arg instanceof Integer) { Integer value = (Integer) arg; Pointer pointer = Pointer.to(new int[] { value }); kernelParameters[i] = pointer; //logger.info("argument " + i + " type is Integer"); } else if (arg instanceof Long) { Long value = (Long) arg; Pointer pointer = Pointer.to(new long[] { value }); kernelParameters[i] = pointer; //logger.info("argument " + i + " type is Long"); } else if (arg instanceof Float) { Float value = (Float) arg; Pointer pointer = Pointer.to(new float[] { value }); kernelParameters[i] = pointer; //logger.info("argument " + i + " type is Float"); } else if (arg instanceof Double) { Double value = (Double) arg; Pointer pointer = Pointer.to(new double[] { value }); kernelParameters[i] = pointer; //logger.info("argument " + i + " type is Double"); } /** * Of note here. double[] of length 1 is * passed to the cuda kernel as a direct double. * Eg: double * Rather than double * * If it's actually, a buffer, we need to ensure * data is allocated on the gpu (hence why we throw the exception) * * This is applicable for any numerical primitive array (the below few) */ else if (arg instanceof double[]) { double[] d = (double[]) arg; if (d.length == 1) kernelParameters[i] = Pointer.to(d); else throw new IllegalArgumentException( "Please wrap double arrays in a buffer with kernelfunctions.alloc()"); } else if (arg instanceof float[]) { float[] f = (float[]) arg; if (f.length == 1) kernelParameters[i] = Pointer.to(f); else throw new IllegalArgumentException( "Please wrap float arrays in a buffer with kernelfunctions.alloc()"); } else if (arg instanceof int[]) { int[] i2 = (int[]) arg; if (i2.length == 1) kernelParameters[i] = Pointer.to(i2); else throw new IllegalArgumentException( "Please wrap int arrays in a buffer with kernelfunctions.alloc()"); } else if (arg instanceof jcuda.jcurand.curandGenerator) { jcuda.jcurand.curandGenerator rng = (jcuda.jcurand.curandGenerator) arg; kernelParameters[i] = Pointer.to(rng); } else { throw new CudaException("Type " + arg.getClass() + " may not be passed to a function"); } } cuLaunchKernel(function, gridSize.x, gridSize.y, gridSize.z, blockSize.x, blockSize.y, blockSize.z, sharedMemSize, stream, Pointer.to(kernelParameters), null); } public CUfunction getFunction() { return function; } /** * The extension of the given file name is replaced with "ptx". * If the file with the resulting name does not exist or is older * than the source file, it is compiled from the given file * using NVCC. If the forceRebuild flag is 'true', then the PTX * file is rebuilt even if it already exists or is newer than the * source file. The name of the PTX file is returned. * * @param cuFileName The name of the .CU file * @param forceRebuild Whether the PTX file should be re-created * even if it exists already. 
* @param nvccArguments Optional arguments for the NVCC * @return The name of the PTX file * @throws IOException If an I/O error occurs * @throws CudaException If the creation of the PTX file fails */ private static String preparePtxFile(String cuFileName, boolean forceRebuild, String... nvccArguments) throws IOException { logger.info("Preparing PTX for \n" + cuFileName); File cuFile = new File(cuFileName); if (!cuFile.exists()) throw new CudaException("Input file not found: " + cuFileName); // Replace the file extension with "ptx" String ptxFileName = null; int lastIndex = cuFileName.lastIndexOf('.'); if (lastIndex == -1) ptxFileName = cuFileName + ".ptx"; else ptxFileName = cuFileName.substring(0, lastIndex) + ".ptx"; // Return if the file already exists and should not be rebuilt File ptxFile = new File(ptxFileName); if (ptxFile.exists() && !forceRebuild) { long cuLastModified = cuFile.lastModified(); long ptxLastModified = ptxFile.lastModified(); if (cuLastModified < ptxLastModified) return ptxFileName; } // Build the command line String modelString = "-m" + System.getProperty("sun.arch.data.model"); String defaultArguments = ""; String optionalArguments = createArgumentsString(nvccArguments); String command = compilerPath + "nvcc " + modelString + " " + defaultArguments + " " + optionalArguments + " -ptx " + cuFile.getPath() + " -o " + ptxFileName; // Execute the command line and wait for the output logger.info("Executing\n" + command); Process process = Runtime.getRuntime().exec(command); String errorMessage = new String(toByteArray(process.getErrorStream())); String outputMessage = new String(toByteArray(process.getInputStream())); int exitValue = 0; try { exitValue = process.waitFor(); } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new CudaException("Interrupted while waiting for nvcc output", e); } logger.info("nvcc process exitValue " + exitValue); if (exitValue != 0) { logger.error("errorMessage:\n" + errorMessage); logger.error("outputMessage:\n" + outputMessage); throw new CudaException("Could not create .ptx file: " + errorMessage); } return ptxFileName; } /** * Creates a single string from the given argument strings * * @param nvccArguments The argument strings * @return A single string containing the arguments */ private static String createArgumentsString(String... nvccArguments) { if (nvccArguments == null || nvccArguments.length == 0) { return ""; } StringBuilder sb = new StringBuilder(); for (String s : nvccArguments) { sb.append(s); sb.append(" "); } return sb.toString(); } /** * Fully reads the given InputStream and returns it as a byte array. * * @param inputStream The input stream to read * @return The byte array containing the data from the input stream * @throws IOException If an I/O error occurs */ private static byte[] toByteArray(InputStream inputStream) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte buffer[] = new byte[8192]; while (true) { int read = inputStream.read(buffer); if (read == -1) { break; } baos.write(buffer, 0, read); } return baos.toByteArray(); } @Override public String toString() { return "KernelLauncher{" + "deviceNumber=" + deviceNumber + ", context=" + context + ", module=" + module + ", function=" + function + ", blockSize=" + blockSize + ", gridSize=" + gridSize + ", sharedMemSize=" + sharedMemSize + ", stream=" + stream + '}'; } }
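To make the workflow described in the class documentation concrete, the following sketch compiles a small element-wise kernel from a source string, configures the launch, and calls it with device pointers and a scalar argument. This is a minimal sketch, not part of the class above: the kernel source, the class name KernelLauncherExample, and the array sizes are illustrative assumptions, and it presumes that nvcc is reachable (on the PATH or set via setCompilerPath) and that the context management used by this KernelLauncher (ND4J's ContextHolder) can provide a CUDA context.

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUdeviceptr;
import jcuda.runtime.dim3;
import jcuda.utils.KernelLauncher;

import static jcuda.driver.JCudaDriver.*;

public class KernelLauncherExample {
    public static void main(String[] args) {
        // A hypothetical element-wise add kernel; note the extern "C"
        // declaration, so the function name is not mangled in the PTX
        String sourceCode =
            "extern \"C\"\n" +
            "__global__ void add(int n, float *a, float *b, float *result)\n" +
            "{\n" +
            "    int i = blockIdx.x * blockDim.x + threadIdx.x;\n" +
            "    if (i < n)\n" +
            "    {\n" +
            "        result[i] = a[i] + b[i];\n" +
            "    }\n" +
            "}\n";

        // Compile the source with nvcc and load the resulting PTX
        KernelLauncher launcher = KernelLauncher.compile(sourceCode, "add");

        // Host data
        int n = 1024;
        float hostA[] = new float[n];
        float hostB[] = new float[n];
        float hostResult[] = new float[n];
        for (int i = 0; i < n; i++) {
            hostA[i] = i;
            hostB[i] = 2 * i;
        }

        // Device memory, filled via the JCuda driver API
        CUdeviceptr deviceA = new CUdeviceptr();
        CUdeviceptr deviceB = new CUdeviceptr();
        CUdeviceptr deviceResult = new CUdeviceptr();
        cuMemAlloc(deviceA, n * Sizeof.FLOAT);
        cuMemAlloc(deviceB, n * Sizeof.FLOAT);
        cuMemAlloc(deviceResult, n * Sizeof.FLOAT);
        cuMemcpyHtoD(deviceA, Pointer.to(hostA), n * Sizeof.FLOAT);
        cuMemcpyHtoD(deviceB, Pointer.to(hostB), n * Sizeof.FLOAT);

        // One thread per element: blockDim.x = 256, gridDim.x = ceil(n / 256)
        int blockSizeX = 256;
        int gridSizeX = (n + blockSizeX - 1) / blockSizeX;
        launcher.setup(new dim3(gridSizeX, 1, 1), new dim3(blockSizeX, 1, 1))
                .call(n, deviceA, deviceB, deviceResult);

        // Copy the result back and release the device memory
        cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, n * Sizeof.FLOAT);
        cuMemFree(deviceA);
        cuMemFree(deviceB);
        cuMemFree(deviceResult);
    }
}

The scalar n goes through the Integer branch of call(Object...), while the CUdeviceptr arguments are Pointer instances and are wrapped automatically; raw float[] or double[] arrays longer than one element are rejected by call(), as shown in its implementation above.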
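The load(InputStream, String) variant is the one the class documentation recommends for shipping precompiled kernels inside a JAR. Below is a small sketch under assumed names: a resource /kernels/scale.ptx containing extern "C" functions scale and scale_inverse is purely hypothetical.

import java.io.InputStream;

import jcuda.utils.KernelLauncher;

public class PtxResourceExample {
    public static KernelLauncher loadScaleKernel() {
        // Read the precompiled PTX from the classpath (e.g. packaged in a JAR)
        InputStream ptxStream =
            PtxResourceExample.class.getResourceAsStream("/kernels/scale.ptx");
        if (ptxStream == null) {
            throw new IllegalStateException("PTX resource not found");
        }
        KernelLauncher scaleLauncher = KernelLauncher.load(ptxStream, "scale");

        // The same module can be reused for another function without loading
        // it again; grid/block settings of the two launchers stay independent
        KernelLauncher inverseLauncher = scaleLauncher.forFunction("scale_inverse");

        return scaleLauncher;
    }
}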
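Finally, the four-argument setup method mirrors the full kernel<<<gridDim, blockDim, sharedMemorySize, stream>>> configuration of the CUDA runtime syntax. The sketch below is illustrative only: it assumes a launcher for a hypothetical kernel reduceSum(int n, float *data) that uses one float of shared memory per thread, and it relies on the standard JCuda driver calls for stream handling.

import jcuda.Sizeof;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUstream;
import jcuda.runtime.dim3;
import jcuda.utils.KernelLauncher;

import static jcuda.driver.JCudaDriver.*;

public class StreamSetupExample {
    public static void launchOnStream(KernelLauncher reduceLauncher,
                                      CUdeviceptr deviceData, int n) {
        // Create a stream so the launch does not block the default stream
        CUstream stream = new CUstream();
        cuStreamCreate(stream, 0);

        int blockSizeX = 256;
        int gridSizeX = (n + blockSizeX - 1) / blockSizeX;
        // One float of shared memory per thread in the block (kernel-specific assumption)
        int sharedMemBytes = blockSizeX * Sizeof.FLOAT;

        reduceLauncher
            .setup(new dim3(gridSizeX, 1, 1), new dim3(blockSizeX, 1, 1), sharedMemBytes, stream)
            .call(n, deviceData);

        // The launch is asynchronous with respect to the host; wait for completion
        cuStreamSynchronize(stream);
        cuStreamDestroy(stream);
    }
}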