Source code for scikits.cuda.misc

#!/usr/bin/env python

"""
Miscellaneous PyCUDA functions.
"""

from __future__ import absolute_import

from string import Template
import atexit

import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import pycuda.elementwise as elementwise
import pycuda.reduction as reduction
import pycuda.scan as scan
import pycuda.tools as tools
from pycuda.compiler import SourceModule
from pytools import memoize
import numpy as np

from . import cuda
from . import cublas

try:
    from . import cula
    _has_cula = True
except (ImportError, OSError):
    _has_cula = False

isdoubletype = lambda x : True if x == np.float64 or \
               x == np.complex128 else False
isdoubletype.__doc__ = """
Check whether a type has double precision.

Parameters
----------
x : numpy float type
    Type to test.

Returns
-------
result : bool
    Result.
"""

iscomplextype = lambda x : True if x == np.complex64 or \
                x == np.complex128 else False
iscomplextype.__doc__ = """
Check whether a type is complex.

Parameters
----------
x : numpy float type
    Type to test.

Returns
-------
result : bool
    Result.

"""
    
def init_device(n=0):
    """
    Initialize a GPU device.

    Initialize a specified GPU device rather than the default device
    found by `pycuda.autoinit`.

    Parameters
    ----------
    n : int
        Device number.

    Returns
    -------
    dev : pycuda.driver.Device
        Initialized device.

    """

    drv.init()
    dev = drv.Device(n)
    return dev

def init_context(dev):
    """
    Create a context that will be cleaned up properly.

    Create a context on the specified device and register its pop()
    method with atexit.

    Parameters
    ----------
    dev : pycuda.driver.Device
        GPU device.

    Returns
    -------
    ctx : pycuda.driver.Context
        Created context.

    """

    ctx = dev.make_context()
    atexit.register(ctx.pop)
    return ctx

def done_context(ctx):
    """
    Detach from a context cleanly.

    Detach from a context and remove its pop() from atexit.

    Parameters
    ----------
    ctx : pycuda.driver.Context
        Context from which to detach.

    """

    for i in xrange(len(atexit._exithandlers)):
        if atexit._exithandlers[i][0] == ctx.pop:
            del atexit._exithandlers[i]
            break
    ctx.detach()

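# Illustrative sketch (not part of the original module): managing a context on
# a specific GPU with the three helpers above instead of pycuda.autoinit.  The
# device number 0 and the helper name _demo_manual_context are arbitrary.
def _demo_manual_context():
    dev = init_device(0)     # initialize the driver and pick a device
    ctx = init_context(dev)  # context whose pop() is registered with atexit
    try:
        pass                 # ... allocate GPUArrays, launch kernels ...
    finally:
        done_context(ctx)    # detach now and unregister the atexit handler
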
global _global_cublas_handle
_global_cublas_handle = None
global _global_cublas_allocator
_global_cublas_allocator = None

def init(allocator=drv.mem_alloc):
    """
    Initialize libraries used by scikits.cuda.

    Initialize the CUBLAS and CULA libraries used by high-level
    functions provided by scikits.cuda.

    Parameters
    ----------
    allocator : callable
        An allocator used internally by some of the high-level
        functions.

    Notes
    -----
    This function does not initialize PyCUDA; it uses whatever device
    and context were initialized in the current host thread.

    """

    # CUBLAS uses whatever device is being used by the host thread:
    global _global_cublas_handle, _global_cublas_allocator
    if not _global_cublas_handle:
        _global_cublas_handle = cublas.cublasCreate()
    if _global_cublas_allocator is None:
        _global_cublas_allocator = allocator

    # culaSelectDevice() need not (and, in fact, cannot) be called
    # here because the host thread has already been bound to a GPU
    # device:
    if _has_cula:
        cula.culaInitialize()

def shutdown():
    """
    Shut down libraries used by scikits.cuda.

    Shut down the CUBLAS and CULA libraries used by high-level
    functions provided by scikits.cuda.

    Notes
    -----
    This function does not shut down PyCUDA.

    """

    global _global_cublas_handle
    if _global_cublas_handle:
        cublas.cublasDestroy(_global_cublas_handle)
        _global_cublas_handle = None

    if _has_cula:
        cula.culaShutdown()

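# Illustrative sketch (not part of the original module): the typical
# initialization/teardown sequence for code built on scikits.cuda.  Assumes a
# context already exists (e.g. created by pycuda.autoinit or init_context()).
#
# >>> import pycuda.autoinit
# >>> import scikits.cuda.misc as misc
# >>> misc.init()      # creates the global CUBLAS handle (and CULA, if present)
# >>> # ... call scikits.cuda high-level routines here ...
# >>> misc.shutdown()  # destroys the CUBLAS handle
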
def get_compute_capability(dev):
    """
    Get the compute capability of the specified device.

    Retrieve the compute capability of the specified CUDA device and
    return it as a floating point value.

    Parameters
    ----------
    dev : pycuda.driver.Device
        Device object to examine.

    Returns
    -------
    c : float
        Compute capability.

    """

    return np.float('.'.join([str(i) for i in
                              dev.compute_capability()]))

def get_current_device():
    """
    Get the device in use by the current context.

    Returns
    -------
    d : pycuda.driver.Device
        Device in use by current context.

    """

    return drv.Device(cuda.cudaGetDevice())

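# Illustrative usage (a sketch, not part of the original module): querying the
# device bound to the current context and its compute capability; the value
# 3.5 shown below is only an example.
#
# >>> import pycuda.autoinit
# >>> dev = get_current_device()
# >>> get_compute_capability(dev)      # doctest: +SKIP
# 3.5
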
@memoize
def get_dev_attrs(dev):
    """
    Get select CUDA device attributes.

    Retrieve select attributes of the specified CUDA device that
    relate to maximum thread block and grid sizes.

    Parameters
    ----------
    dev : pycuda.driver.Device
        Device object to examine.

    Returns
    -------
    attrs : list
        List containing [MAX_THREADS_PER_BLOCK,
        (MAX_BLOCK_DIM_X, MAX_BLOCK_DIM_Y, MAX_BLOCK_DIM_Z),
        (MAX_GRID_DIM_X, MAX_GRID_DIM_Y, MAX_GRID_DIM_Z)]

    """

    attrs = dev.get_attributes()
    return [attrs[drv.device_attribute.MAX_THREADS_PER_BLOCK],
            (attrs[drv.device_attribute.MAX_BLOCK_DIM_X],
             attrs[drv.device_attribute.MAX_BLOCK_DIM_Y],
             attrs[drv.device_attribute.MAX_BLOCK_DIM_Z]),
            (attrs[drv.device_attribute.MAX_GRID_DIM_X],
             attrs[drv.device_attribute.MAX_GRID_DIM_Y],
             attrs[drv.device_attribute.MAX_GRID_DIM_Z])]

iceil = lambda n: int(np.ceil(n))

@memoize
def select_block_grid_sizes(dev, data_shape, threads_per_block=None):
    """
    Determine CUDA block and grid dimensions given device constraints.

    Determine the CUDA block and grid dimensions allowed by a GPU
    device that are sufficient for processing every element of an
    array in a separate thread.

    Parameters
    ----------
    dev : pycuda.driver.Device
        Device object to be used.
    data_shape : tuple
        Shape of input data array.
    threads_per_block : int, optional
        Number of threads to execute in each block. If this is None,
        the maximum number of threads per block allowed by device
        `dev` is used.

    Returns
    -------
    block_dim : tuple
        X, Y, and Z dimensions of minimal required thread block.
    grid_dim : tuple
        X, Y, and Z dimensions of minimal required block grid.

    Notes
    -----
    Using the scheme in this function, all of the threads in the grid
    can be enumerated as

    `i = blockIdx.y*max_threads_per_block*max_blocks_per_grid+
    blockIdx.x*max_threads_per_block+threadIdx.x`

    For 2D shapes, the subscripts of the element `data[a, b]` where
    `data.shape == (A, B)` can be computed as

    `a = i/B`
    `b = mod(i, B)`

    For 3D shapes, the subscripts of the element `data[a, b, c]` where
    `data.shape == (A, B, C)` can be computed as

    `a = i/(B*C)`
    `b = mod(i, B*C)/C`
    `c = mod(mod(i, B*C), C)`

    For 4D shapes, the subscripts of the element `data[a, b, c, d]`
    where `data.shape == (A, B, C, D)` can be computed as

    `a = i/(B*C*D)`
    `b = mod(i, B*C*D)/(C*D)`
    `c = mod(mod(i, B*C*D), C*D)/D`
    `d = mod(mod(mod(i, B*C*D), C*D), D)`

    It is advisable that the number of threads per block be a multiple
    of the warp size to fully utilize a device's computing resources.

    """

    # Sanity checks:
    if np.isscalar(data_shape):
        data_shape = (data_shape,)

    # Number of elements to process; we need to cast the result of
    # np.prod to a Python int to prevent PyCUDA's kernel execution
    # framework from getting confused when it is given a numpy integer:
    N = int(np.prod(data_shape))

    # Get device constraints:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)

    if threads_per_block is not None:
        if threads_per_block > max_threads_per_block:
            raise ValueError('threads per block exceeds device maximum')
        else:
            max_threads_per_block = threads_per_block

    # Actual number of thread blocks needed:
    blocks_needed = iceil(N/float(max_threads_per_block))

    if blocks_needed <= max_grid_dim[0]:
        return (max_threads_per_block, 1, 1), (blocks_needed, 1, 1)
    elif blocks_needed > max_grid_dim[0] and \
         blocks_needed <= max_grid_dim[0]*max_grid_dim[1]:
        return (max_threads_per_block, 1, 1), \
               (max_grid_dim[0],
                iceil(blocks_needed/float(max_grid_dim[0])), 1)
    elif blocks_needed > max_grid_dim[0]*max_grid_dim[1] and \
         blocks_needed <= max_grid_dim[0]*max_grid_dim[1]*max_grid_dim[2]:
        return (max_threads_per_block, 1, 1), \
               (max_grid_dim[0], max_grid_dim[1],
                iceil(blocks_needed/float(max_grid_dim[0]*max_grid_dim[1])))
    else:
        raise ValueError('array size too large')

def zeros(shape, dtype, allocator=drv.mem_alloc):
    """
    Return an array of the given shape and dtype filled with zeros.

    Parameters
    ----------
    shape : tuple
        Array shape.
    dtype : data-type
        Data type for the array.
    allocator : callable
        Returns an object that represents the memory allocated for
        the requested array.

    Returns
    -------
    out : pycuda.gpuarray.GPUArray
        Array of zeros with the given shape and dtype.

    Notes
    -----
    This function exists to work around the following numpy bug that
    prevents pycuda.gpuarray.zeros() from working properly with
    complex types in pycuda 2011.1.2:
    http://projects.scipy.org/numpy/ticket/1898

    """

    out = gpuarray.GPUArray(shape, dtype, allocator)
    out.fill(0)
    return out

def zeros_like(a):
    """
    Return an array of zeros with the same shape and type as a given
    array.

    Parameters
    ----------
    a : array_like
        The shape and data type of `a` determine the corresponding
        attributes of the returned array.

    Returns
    -------
    out : pycuda.gpuarray.GPUArray
        Array of zeros with the shape and dtype of `a`.

    """

    out = gpuarray.GPUArray(a.shape, a.dtype, drv.mem_alloc)
    out.fill(0)
    return out

def ones(shape, dtype, allocator=drv.mem_alloc):
    """
    Return an array of the given shape and dtype filled with ones.

    Parameters
    ----------
    shape : tuple
        Array shape.
    dtype : data-type
        Data type for the array.
    allocator : callable
        Returns an object that represents the memory allocated for
        the requested array.

    Returns
    -------
    out : pycuda.gpuarray.GPUArray
        Array of ones with the given shape and dtype.

    """

    out = gpuarray.GPUArray(shape, dtype, allocator)
    out.fill(1)
    return out

def ones_like(other):
    """
    Return an array of ones with the same shape and type as a given
    array.

    Parameters
    ----------
    other : pycuda.gpuarray.GPUArray
        Array whose shape and dtype are to be used to allocate a new
        array.

    Returns
    -------
    out : pycuda.gpuarray.GPUArray
        Array of ones with the shape and dtype of `other`.

    """

    out = gpuarray.GPUArray(other.shape, other.dtype, other.allocator)
    out.fill(1)
    return out

def inf(shape, dtype, allocator=drv.mem_alloc):
    """
    Return an array of the given shape and dtype filled with infs.

    Parameters
    ----------
    shape : tuple
        Array shape.
    dtype : data-type
        Data type for the array.
    allocator : callable
        Returns an object that represents the memory allocated for
        the requested array.

    Returns
    -------
    out : pycuda.gpuarray.GPUArray
        Array of infs with the given shape and dtype.

    """

    out = gpuarray.GPUArray(shape, dtype, allocator)
    out.fill(np.inf)
    return out

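# Illustrative usage of the allocation helpers above (a sketch, not part of
# the original module):
#
# >>> import pycuda.autoinit
# >>> z_gpu = zeros((3, 3), np.complex64)
# >>> o_gpu = ones_like(z_gpu)
# >>> np.allclose(o_gpu.get(), np.ones((3, 3), np.complex64))
# True
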
def maxabs(x_gpu):
    """
    Get maximum absolute value.

    Find maximum absolute value in the specified array.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.

    Returns
    -------
    m_gpu : pycuda.gpuarray.GPUArray
        Array containing maximum absolute value in `x_gpu`.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import misc
    >>> x_gpu = gpuarray.to_gpu(np.array([-1, 2, -3], np.float32))
    >>> m_gpu = misc.maxabs(x_gpu)
    >>> np.allclose(m_gpu.get(), 3.0)
    True

    """

    try:
        func = maxabs.cache[x_gpu.dtype]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        use_double = int(x_gpu.dtype in [np.float64, np.complex128])
        ret_type = np.float64 if use_double else np.float32
        func = reduction.ReductionKernel(ret_type, neutral="0",
                                         reduce_expr="max(a,b)",
                                         map_expr="abs(x[i])",
                                         arguments="{ctype} *x".format(ctype=ctype))
        maxabs.cache[x_gpu.dtype] = func
    return func(x_gpu)

maxabs.cache = {}
def cumsum(x_gpu):
    """
    Cumulative sum.

    Return the cumulative sum of the elements in the specified array.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.

    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray
        Output array containing cumulative sum of `x_gpu`.

    Notes
    -----
    Higher dimensional arrays are implicitly flattened row-wise by
    this function.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import misc
    >>> x_gpu = gpuarray.to_gpu(np.random.rand(5).astype(np.float32))
    >>> c_gpu = misc.cumsum(x_gpu)
    >>> np.allclose(c_gpu.get(), np.cumsum(x_gpu.get()))
    True

    """

    try:
        func = cumsum.cache[x_gpu.dtype]
    except KeyError:
        func = scan.InclusiveScanKernel(x_gpu.dtype, 'a+b',
                                        preamble='#include <pycuda-complex.hpp>')
        cumsum.cache[x_gpu.dtype] = func
    return func(x_gpu)

cumsum.cache = {}
def diff(x_gpu):
    """
    Calculate the discrete difference.

    Calculates the first order difference between the successive
    entries of a vector.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input vector.

    Returns
    -------
    y_gpu : pycuda.gpuarray.GPUArray
        Discrete difference.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import misc
    >>> x = np.asarray(np.random.rand(5), np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = misc.diff(x_gpu)
    >>> np.allclose(np.diff(x), y_gpu.get())
    True

    """

    y_gpu = gpuarray.empty(len(x_gpu)-1, x_gpu.dtype)
    try:
        func = diff.cache[x_gpu.dtype]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        func = elementwise.ElementwiseKernel("{ctype} *a, {ctype} *b".format(ctype=ctype),
                                             "b[i] = a[i+1]-a[i]")
        diff.cache[x_gpu.dtype] = func
    func(x_gpu, y_gpu)
    return y_gpu

diff.cache = {}

# List of available numerical types provided by numpy:
num_types = [np.typeDict[t] for t in \
             np.typecodes['AllInteger']+np.typecodes['AllFloat']]

# Numbers of bytes occupied by each numerical type:
num_nbytes = dict((np.dtype(t), t(1).nbytes) for t in num_types)

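# Illustrative check (a sketch, not part of the original module): num_nbytes
# maps each supported dtype to its per-element size in bytes.
#
# >>> num_nbytes[np.dtype(np.float32)]
# 4
# >>> num_nbytes[np.dtype(np.complex128)]
# 16
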
def set_realloc(x_gpu, data):
    """
    Transfer data into a GPUArray instance.

    Copies the contents of a numpy array into a GPUArray instance. If
    the array has a different type or dimensions than the instance,
    the GPU memory used by the instance is reallocated and the
    instance updated appropriately.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance to modify.
    data : numpy.ndarray
        Array of data to transfer to the GPU.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import misc
    >>> x = np.asarray(np.random.rand(5), np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> x = np.asarray(np.random.rand(10, 1), np.float64)
    >>> set_realloc(x_gpu, x)
    >>> np.allclose(x, x_gpu.get())
    True

    """

    # Only reallocate if absolutely necessary:
    if x_gpu.shape != data.shape or x_gpu.size != data.size or \
       x_gpu.strides != data.strides or x_gpu.dtype != data.dtype:

        # Free old memory:
        x_gpu.gpudata.free()

        # Allocate new memory:
        nbytes = num_nbytes[data.dtype]
        x_gpu.gpudata = drv.mem_alloc(nbytes*data.size)

        # Set array attributes:
        x_gpu.shape = data.shape
        x_gpu.size = data.size
        x_gpu.strides = data.strides
        x_gpu.dtype = data.dtype

    # Update the GPU memory:
    x_gpu.set(data)

if __name__ == "__main__":
    import doctest
    doctest.testmod()