Source code for scikits.cuda.linalg

#!/usr/bin/env python

"""
PyCUDA-based linear algebra functions.
"""

from __future__ import absolute_import

from pprint import pprint
from string import Template
from pycuda.compiler import SourceModule

import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import pycuda.elementwise as el
import pycuda.tools as tools
import numpy as np

from . import cublas
from . import misc

import sys
if sys.version_info < (3,):
    range = xrange


class LinAlgError(Exception):
    """Linear Algebra Error."""
    pass


try:
    from . import cula
    _has_cula = True
except (ImportError, OSError):
    _has_cula = False

from .misc import init

# Get installation location of C headers:
from . import install_headers

def svd(a_gpu, jobu='A', jobvt='A'):
    """
    Singular Value Decomposition.

    Factors the matrix `a` into two unitary matrices, `u` and `vh`,
    and a 1-dimensional array of real, non-negative singular values,
    `s`, such that `a == dot(u, dot(diag(s), vh))`.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)` to decompose.
    jobu : {'A', 'S', 'O', 'N'}
        If 'A', return the full `u` matrix with shape `(m, m)`.
        If 'S', return the `u` matrix with shape `(m, k)`.
        If 'O', return the `u` matrix with shape `(m, k)` without
        allocating a new matrix.
        If 'N', don't return `u`.
    jobvt : {'A', 'S', 'O', 'N'}
        If 'A', return the full `vh` matrix with shape `(n, n)`.
        If 'S', return the `vh` matrix with shape `(k, n)`.
        If 'O', return the `vh` matrix with shape `(k, n)` without
        allocating a new matrix.
        If 'N', don't return `vh`.

    Returns
    -------
    u : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(m, m)` or `(m, k)` depending on
        value of `jobu`.
    s : pycuda.gpuarray.GPUArray
        Array containing the singular values, sorted such that
        `s[i] >= s[i+1]`. `s` is of length `min(m, n)`.
    vh : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(n, n)` or `(k, n)`, depending
        on `jobvt`.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix regardless
    of the values of `jobu` and `jobvt`.

    Only one of `jobu` or `jobvt` may be set to `O`, and then only for
    a square matrix.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6)
    >>> a = np.asarray(a, np.complex64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 'S', 'S')
    >>> np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), 1e-4)
    True
    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    alloc = misc._global_cublas_allocator

    # The free version of CULA only supports single precision floating
    # point numbers:
    data_type = a_gpu.dtype.type
    real_type = np.float32
    if data_type == np.complex64:
        cula_func = cula._libcula.culaDeviceCgesvd
    elif data_type == np.float32:
        cula_func = cula._libcula.culaDeviceSgesvd
    else:
        if cula._libcula_toolkit == 'standard':
            if data_type == np.complex128:
                cula_func = cula._libcula.culaDeviceZgesvd
            elif data_type == np.float64:
                cula_func = cula._libcula.culaDeviceDgesvd
            else:
                raise ValueError('unsupported type')
            real_type = np.float64
        else:
            raise ValueError('double precision not supported')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    n, m = a_gpu.shape
    square = (n == m)

    # Since the input matrix is transposed, jobu and jobvt must also
    # be switched because the computed matrices will be returned in
    # reversed order:
    jobvt, jobu = jobu, jobvt

    # Set the leading dimension of the input matrix:
    lda = max(1, m)

    # Allocate the array of singular values:
    s_gpu = gpuarray.empty(min(m, n), real_type, allocator=alloc)

    # Set the leading dimension and allocate u:
    jobu = jobu.upper()
    jobvt = jobvt.upper()
    ldu = m
    if jobu == 'A':
        u_gpu = gpuarray.empty((ldu, m), data_type, allocator=alloc)
    elif jobu == 'S':
        u_gpu = gpuarray.empty((min(m, n), ldu), data_type, allocator=alloc)
    elif jobu == 'O':
        if not square:
            raise ValueError('in-place computation of singular vectors '+
                             'of non-square matrix not allowed')
        ldu = 1
        u_gpu = a_gpu
    else:
        ldu = 1
        u_gpu = gpuarray.empty((), data_type, allocator=alloc)

    # Set the leading dimension and allocate vh:
    if jobvt == 'A':
        ldvt = n
        vh_gpu = gpuarray.empty((n, n), data_type, allocator=alloc)
    elif jobvt == 'S':
        ldvt = min(m, n)
        vh_gpu = gpuarray.empty((n, ldvt), data_type, allocator=alloc)
    elif jobvt == 'O':
        if jobu == 'O':
            raise ValueError('jobu and jobvt cannot both be O')
        if not square:
            raise ValueError('in-place computation of singular vectors '+
                             'of non-square matrix not allowed')
        ldvt = 1
        vh_gpu = a_gpu
    else:
        ldvt = 1
        vh_gpu = gpuarray.empty((), data_type, allocator=alloc)

    # Compute SVD and check error status:
    status = cula_func(jobu, jobvt, m, n, int(a_gpu.gpudata),
                       lda, int(s_gpu.gpudata), int(u_gpu.gpudata),
                       ldu, int(vh_gpu.gpudata), ldvt)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    # Since the input is assumed to be transposed, it is necessary to
    # return the computed matrices in reverse order:
    if jobu in ['A', 'S', 'O'] and jobvt in ['A', 'S', 'O']:
        return vh_gpu, s_gpu, u_gpu
    elif jobu == 'N' and jobvt != 'N':
        return vh_gpu, s_gpu
    elif jobu != 'N' and jobvt == 'N':
        return s_gpu, u_gpu
    else:
        return s_gpu

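# Editorial sketch (not part of the original module): the column-major
# bookkeeping above can be verified on the host. A column-major routine
# handed the buffer of a row-major (n, m) array effectively sees the (m, n)
# transpose; decomposing the transpose and swapping the roles of `u` and
# `vh` (hence the swapped jobu/jobvt above) recovers the SVD of the original
# matrix, since a.T = u*diag(s)*vh implies a = vh.T*diag(s)*u.T. A minimal
# NumPy check of that identity:

def _check_svd_transpose_trick():
    """Verify that the SVD of a.T, with factors swapped, yields the SVD of a."""
    a = np.random.randn(5, 3).astype(np.float32)
    # Decompose the transpose, as a column-major routine would see `a`:
    ut, s, vht = np.linalg.svd(a.T, full_matrices=False)
    # Swap and transpose the factors to recover a = u*diag(s)*vh:
    u, vh = vht.T, ut.T
    assert np.allclose(a, np.dot(u, np.dot(np.diag(s), vh)), atol=1e-4)
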
def cho_factor(a_gpu, uplo='L'):
    """
    Cholesky factorisation.

    Performs an in-place Cholesky factorisation on the matrix `a`
    such that `a = x*x.T` or `x.T*x`, if the lower='L' or upper='U'
    triangle of `a` is used, respectively.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, m)` to decompose.
    uplo : chr
        Use the upper='U' or lower='L' (default) triangle of `a`.

    Returns
    -------
    None
        The operation is performed in place; the factorised triangle
        overwrites the corresponding triangle of the input matrix.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.linalg
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[3.0, 0.0], [0.0, 7.0]])
    >>> a = np.asarray(a, np.float64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> linalg.cho_factor(a_gpu)
    >>> np.allclose(a_gpu.get(), scipy.linalg.cho_factor(a)[0])
    True
    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    data_type = a_gpu.dtype.type
    real_type = np.float32
    if cula._libcula_toolkit == 'standard':
        if data_type == np.complex64:
            cula_func = cula._libcula.culaDeviceCpotrf
        elif data_type == np.float32:
            cula_func = cula._libcula.culaDeviceSpotrf
        elif data_type == np.complex128:
            cula_func = cula._libcula.culaDeviceZpotrf
        elif data_type == np.float64:
            cula_func = cula._libcula.culaDeviceDpotrf
        else:
            raise ValueError('unsupported type')
        real_type = np.float64
    else:
        raise ValueError('Cholesky factorisation not included in CULA Dense Free version')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    n, m = a_gpu.shape
    if (n != m):
        raise ValueError('Matrix must be symmetric positive-definite')

    # Set the leading dimension of the input matrix:
    lda = max(1, m)

    status = cula_func(uplo, n, int(a_gpu.gpudata), lda)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    # In-place operation. No return matrix. Result is stored in the
    # input matrix.

def cho_solve(a_gpu, b_gpu, uplo='L'):
    """
    Cholesky solver.

    Solve a system of equations via Cholesky factorisation,
    i.e. `a*x = b`. Overwrites `b` to give `inv(a)*b`, and overwrites
    the chosen triangle of `a` with the factorised triangle.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, m)` to decompose.
    b : pycuda.gpuarray.GPUArray
        Right-hand side of shape `(m, 1)`.
    uplo : chr
        Use the upper='U' or lower='L' (default) triangle of `a`.

    Returns
    -------
    None
        The operation is performed in place; the solution overwrites
        `b` and the factorised triangle overwrites `a`.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.linalg
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[3.0, 0.0], [0.0, 7.0]])
    >>> a = np.asarray(a, np.float64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b = np.array([11., 19.])
    >>> b = np.asarray(b, np.float64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> linalg.cho_solve(a_gpu, b_gpu)
    >>> np.allclose(b_gpu.get(), scipy.linalg.cho_solve(scipy.linalg.cho_factor(a), b))
    True
    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    data_type = a_gpu.dtype.type
    real_type = np.float32
    if cula._libcula_toolkit == 'standard':
        if data_type == np.complex64:
            cula_func = cula._libcula.culaDeviceCposv
        elif data_type == np.float32:
            cula_func = cula._libcula.culaDeviceSposv
        elif data_type == np.complex128:
            cula_func = cula._libcula.culaDeviceZposv
        elif data_type == np.float64:
            cula_func = cula._libcula.culaDeviceDposv
        else:
            raise ValueError('unsupported type')
        real_type = np.float64
    else:
        raise ValueError('Cholesky factorisation not included in CULA Dense Free version')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    na, ma = a_gpu.shape
    if (na != ma):
        raise ValueError('Matrix must be symmetric positive-definite')

    if a_gpu.flags.c_contiguous != b_gpu.flags.c_contiguous:
        raise ValueError('unsupported combination of input order')

    b_shape = b_gpu.shape
    if len(b_shape) == 1:
        b_shape = (b_shape[0], 1)

    if a_gpu.flags.f_contiguous:
        lda = max(1, na)
        ldb = max(1, b_shape[0])
    else:
        lda = max(1, ma)
        ldb = lda
        if b_shape[1] > 1:
            raise ValueError('only vectors allowed in c-order RHS')

    # The number of right-hand sides is the number of columns in b:
    status = cula_func(uplo, na, b_shape[1], int(a_gpu.gpudata), lda,
                       int(b_gpu.gpudata), ldb)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    # In-place operation. No return matrix. Result is stored in the
    # input matrix and in the input vector.

def dot(x_gpu, y_gpu, transa='N', transb='N', handle=None, out=None):
    """
    Dot product of two arrays.

    For 1D arrays, this function computes the inner product. For 2D
    arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix
    product; the result has shape `(m, n)`.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.
    y_gpu : pycuda.gpuarray.GPUArray
        Input array.
    transa : char
        If 'T', compute the product of the transpose of `x_gpu`.
        If 'C', compute the product of the Hermitian of `x_gpu`.
    transb : char
        If 'T', compute the product of the transpose of `y_gpu`.
        If 'C', compute the product of the Hermitian of `y_gpu`.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `scikits.cuda.misc._global_cublas_handle` is used.
    out : pycuda.gpuarray.GPUArray, optional
        Output argument. Will be used to store the result.

    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray, float{32,64}, or complex{64,128}
        Inner product of `x_gpu` and `y_gpu`. When the inputs are 1D
        arrays, the result will be returned as a scalar.

    Notes
    -----
    The input matrices must all contain elements of the same data type.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> import misc
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 2), np.float32)
    >>> b = np.asarray(np.random.rand(2, 2), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> c_gpu = linalg.dot(a_gpu, b_gpu)
    >>> np.allclose(np.dot(a, b), c_gpu.get())
    True
    >>> d = np.asarray(np.random.rand(5), np.float32)
    >>> e = np.asarray(np.random.rand(5), np.float32)
    >>> d_gpu = gpuarray.to_gpu(d)
    >>> e_gpu = gpuarray.to_gpu(e)
    >>> f = linalg.dot(d_gpu, e_gpu)
    >>> np.allclose(np.dot(d, e), f)
    True
    """

    if handle is None:
        handle = misc._global_cublas_handle

    alloc = misc._global_cublas_allocator

    if len(x_gpu.shape) == 1 and len(y_gpu.shape) == 1:
        if x_gpu.size != y_gpu.size:
            raise ValueError('arrays must be of same length')

        # Compute inner product for 1D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas.cublasCdotu
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas.cublasSdot
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas.cublasZdotu
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas.cublasDdot
        else:
            raise ValueError('unsupported combination of input types')

        return cublas_func(handle, x_gpu.size, x_gpu.gpudata, 1,
                           y_gpu.gpudata, 1)
    else:
        # Get the shapes of the arguments (accounting for the
        # possibility that one of them may only have one dimension):
        x_shape = x_gpu.shape
        y_shape = y_gpu.shape
        if len(x_shape) == 1:
            x_shape = (1, x_shape[0])
        if len(y_shape) == 1:
            y_shape = (1, y_shape[0])

        # Perform matrix multiplication for 2D arrays:
        if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64):
            cublas_func = cublas.cublasCgemm
            alpha = np.complex64(1.0)
            beta = np.complex64(0.0)
        elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32):
            cublas_func = cublas.cublasSgemm
            alpha = np.float32(1.0)
            beta = np.float32(0.0)
        elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128):
            cublas_func = cublas.cublasZgemm
            alpha = np.complex128(1.0)
            beta = np.complex128(0.0)
        elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64):
            cublas_func = cublas.cublasDgemm
            alpha = np.float64(1.0)
            beta = np.float64(0.0)
        else:
            raise ValueError('unsupported combination of input types')

        transa = transa.lower()
        transb = transb.lower()

        if x_gpu.flags.c_contiguous != y_gpu.flags.c_contiguous:
            raise ValueError('unsupported combination of input order')

        if x_gpu.flags.f_contiguous:
            if transa in ['t', 'c']:
                k, m = x_shape
            elif transa in ['n']:
                m, k = x_shape
            else:
                raise ValueError('invalid value for transa')

            if transb in ['t', 'c']:
                n, l = y_shape
            elif transb in ['n']:
                l, n = y_shape
            else:
                raise ValueError('invalid value for transb')

            if l != k:
                raise ValueError('objects are not aligned')

            lda = max(1, x_shape[0])
            ldb = max(1, y_shape[0])
            ldc = max(1, m)

            if out is None:
                c_gpu = gpuarray.empty((m, n), x_gpu.dtype, order="F",
                                       allocator=alloc)
            else:
                if out.shape != (m, n) or out.dtype != x_gpu.dtype:
                    raise ValueError('invalid value for out')
                if x_gpu.flags.f_contiguous != out.flags.f_contiguous:
                    raise ValueError('invalid order for out')
                c_gpu = out

            cublas_func(handle, transa, transb, m, n, k, alpha,
                        x_gpu.gpudata, lda, y_gpu.gpudata, ldb,
                        beta, c_gpu.gpudata, ldc)
        else:
            if transb in ['t', 'c']:
                m, k = y_shape
            elif transb in ['n']:
                k, m = y_shape
            else:
                raise ValueError('invalid value for transb')

            if transa in ['t', 'c']:
                l, n = x_shape
            elif transa in ['n']:
                n, l = x_shape
            else:
                raise ValueError('invalid value for transa')

            if l != k:
                raise ValueError('objects are not aligned')

            lda = max(1, y_shape[1])
            ldb = max(1, x_shape[1])
            ldc = max(1, m)

            # Note that the desired shape of the output matrix is the
            # transpose of what CUBLAS assumes:
            if out is None:
                c_gpu = gpuarray.empty((n, ldc), x_gpu.dtype,
                                       allocator=alloc)
            else:
                if out.shape != (n, ldc) or out.dtype != x_gpu.dtype:
                    raise ValueError('invalid value for out')
                if x_gpu.flags.f_contiguous != out.flags.f_contiguous:
                    raise ValueError('invalid order for out')
                c_gpu = out

            cublas_func(handle, transb, transa, m, n, k, alpha,
                        y_gpu.gpudata, lda, x_gpu.gpudata, ldb,
                        beta, c_gpu.gpudata, ldc)

        return c_gpu

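# Editorial sketch (not part of the original module): the C-contiguous
# branch above relies on the identity (A*B)^T = B^T * A^T. A column-major
# GEMM implicitly reads the row-major buffers as transposed, so calling it
# with the operands swapped writes out (A*B)^T in column-major order, which
# the row-major caller reads back as A*B. A minimal NumPy check:

def _check_rowmajor_gemm_trick():
    """Verify that swapping operands of a transposed product gives A*B."""
    a = np.random.rand(4, 3)
    b = np.random.rand(3, 2)
    # The column-major routine sees a.T and b.T; multiplying them in
    # swapped order yields (a*b).T, i.e. a*b when read back row-major:
    assert np.allclose(np.dot(a, b), np.dot(b.T, a.T).T)
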
def mdot(*args, **kwargs):
    """
    Product of several matrices.

    Computes the matrix product of several arrays.

    Parameters
    ----------
    a_gpu, b_gpu, ... : pycuda.gpuarray.GPUArray
        Arrays to multiply.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `scikits.cuda.misc._global_cublas_handle` is used.

    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray
        Matrix product of `a_gpu`, `b_gpu`, etc.

    Notes
    -----
    The input matrices must all contain elements of the same data type.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 2), np.float32)
    >>> b = np.asarray(np.random.rand(2, 2), np.float32)
    >>> c = np.asarray(np.random.rand(2, 2), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> c_gpu = gpuarray.to_gpu(c)
    >>> d_gpu = linalg.mdot(a_gpu, b_gpu, c_gpu)
    >>> np.allclose(np.dot(a, np.dot(b, c)), d_gpu.get())
    True
    """

    # dict.has_key() was removed in Python 3; test membership with `in`:
    if 'handle' in kwargs and kwargs['handle'] is not None:
        handle = kwargs['handle']
    else:
        handle = misc._global_cublas_handle

    out_gpu = args[0]
    for i, next_gpu in enumerate(args[1:]):
        temp_gpu = dot(out_gpu, next_gpu, handle=handle)

        # Free the temporary matrix allocated when computing the dot
        # product; the caller's first argument is left untouched:
        if i > 0:
            out_gpu.gpudata.free()
        del(out_gpu)
        out_gpu = temp_gpu
        del(temp_gpu)
    return out_gpu

def dot_diag(d_gpu, a_gpu, trans='N', overwrite=True, handle=None):
    """
    Dot product of diagonal and non-diagonal arrays.

    Computes the matrix product of a diagonal array represented as a
    vector and a non-diagonal array.

    Parameters
    ----------
    d_gpu : pycuda.gpuarray.GPUArray
        Array of length `N` corresponding to the diagonal of the
        multiplier.
    a_gpu : pycuda.gpuarray.GPUArray
        Multiplicand array with shape `(N, M)`.
    trans : char
        If 'T', compute the product of the transpose of `a_gpu`.
    overwrite : bool
        If true (default), save the result in `a_gpu`.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `scikits.cuda.misc._global_cublas_handle` is used.

    Returns
    -------
    r_gpu : pycuda.gpuarray.GPUArray
        The computed matrix product.

    Notes
    -----
    `d_gpu` and `a_gpu` must have the same precision data type. `d_gpu`
    may be real and `a_gpu` may be complex, but not vice-versa.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> d = np.random.rand(4)
    >>> a = np.random.rand(4, 4)
    >>> d_gpu = gpuarray.to_gpu(d)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> r_gpu = linalg.dot_diag(d_gpu, a_gpu)
    >>> np.allclose(np.dot(np.diag(d), a), r_gpu.get())
    True
    """

    if handle is None:
        handle = misc._global_cublas_handle

    # Needed below when overwrite is False:
    alloc = misc._global_cublas_allocator

    if len(d_gpu.shape) != 1:
        raise ValueError('d_gpu must be a vector')
    if len(a_gpu.shape) != 2:
        raise ValueError('a_gpu must be a matrix')

    trans = trans.lower()
    if trans == 'n':
        rows, cols = a_gpu.shape
    else:
        cols, rows = a_gpu.shape

    N = d_gpu.size
    if N != rows:
        raise ValueError('incompatible dimensions')

    float_type = a_gpu.dtype.type
    if float_type == np.float32:
        if d_gpu.dtype != np.float32:
            raise ValueError('precision of argument types must be the same')
        scal_func = cublas.cublasSscal
        copy_func = cublas.cublasScopy
    elif float_type == np.float64:
        if d_gpu.dtype != np.float64:
            raise ValueError('precision of argument types must be the same')
        scal_func = cublas.cublasDscal
        copy_func = cublas.cublasDcopy
    elif float_type == np.complex64:
        if d_gpu.dtype == np.complex64:
            scal_func = cublas.cublasCscal
        elif d_gpu.dtype == np.float32:
            scal_func = cublas.cublasCsscal
        else:
            raise ValueError('precision of argument types must be the same')
        copy_func = cublas.cublasCcopy
    elif float_type == np.complex128:
        if d_gpu.dtype == np.complex128:
            scal_func = cublas.cublasZscal
        elif d_gpu.dtype == np.float64:
            scal_func = cublas.cublasZdscal
        else:
            raise ValueError('precision of argument types must be the same')
        copy_func = cublas.cublasZcopy
    else:
        raise ValueError('unrecognized type')

    d = d_gpu.get()

    if overwrite:
        r_gpu = a_gpu
    else:
        r_gpu = gpuarray.empty((a_gpu.shape[0], a_gpu.shape[1]),
                               a_gpu.dtype, allocator=alloc)
        copy_func(handle, a_gpu.size, int(a_gpu.gpudata), 1,
                  int(r_gpu.gpudata), 1)

    if (trans == 'n' and a_gpu.flags.c_contiguous) \
        or (trans == 't' and a_gpu.flags.f_contiguous):
        incx = 1
        bytes_step = cols*float_type().itemsize
    else:
        incx = rows
        bytes_step = float_type().itemsize

    # Scale each row of the multiplicand by the corresponding diagonal
    # entry:
    for i in range(N):
        scal_func(handle, cols, d[i], int(r_gpu.gpudata)+i*bytes_step, incx)

    return r_gpu

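# Editorial sketch (not part of the original module): dot_diag realizes
# diag(d)*A as N independent scalings, one per row of A. For trans='N' on a
# C-contiguous A, row i is a contiguous run of `cols` elements starting at
# byte offset i*cols*itemsize, which is exactly what the SCAL calls above
# traverse; otherwise the row is visited with a stride. The same access
# pattern on the host:

def _check_dot_diag_strides():
    """Scale each row of a C-contiguous matrix in place, as SCAL would."""
    d = np.random.rand(4)
    a = np.random.rand(4, 3)
    r = a.copy()
    flat = r.ravel()                     # view of the underlying buffer
    cols = a.shape[1]
    for i in range(d.size):
        flat[i*cols:(i+1)*cols] *= d[i]  # one row == one contiguous slice
    assert np.allclose(r, np.dot(np.diag(d), a))
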
def add_diag(d_gpu, a_gpu, overwrite=True, handle=None):
    """
    Adds a vector to the diagonal of an array.

    This is the same as A + diag(D), but faster.

    Parameters
    ----------
    d_gpu : pycuda.gpuarray.GPUArray
        Array of length `N` corresponding to the vector to be added to
        the diagonal.
    a_gpu : pycuda.gpuarray.GPUArray
        Summand array with shape `(N, N)`.
    overwrite : bool
        If true, save the result in `a_gpu` (default: True).
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `scikits.cuda.misc._global_cublas_handle` is used.

    Returns
    -------
    r_gpu : pycuda.gpuarray.GPUArray
        The computed sum.

    Notes
    -----
    `d_gpu` and `a_gpu` must have the same precision data type.
    """

    if handle is None:
        handle = misc._global_cublas_handle

    if not (len(d_gpu.shape) == 1 or (d_gpu.shape[0] == 1 or d_gpu.shape[1] == 1)):
        raise ValueError('d_gpu must be a vector')
    if len(a_gpu.shape) != 2:
        raise ValueError('a_gpu must be a matrix')
    if a_gpu.shape[0] != a_gpu.shape[1]:
        raise ValueError('a_gpu must be square')

    if d_gpu.size != a_gpu.shape[0]:
        raise ValueError('incompatible dimensions')

    if a_gpu.dtype != d_gpu.dtype:
        raise ValueError('precision of argument types must be the same')

    if (a_gpu.dtype == np.complex64):
        axpy = cublas.cublasCaxpy
    elif (a_gpu.dtype == np.float32):
        axpy = cublas.cublasSaxpy
    elif (a_gpu.dtype == np.complex128):
        axpy = cublas.cublasZaxpy
    elif (a_gpu.dtype == np.float64):
        axpy = cublas.cublasDaxpy
    else:
        raise ValueError('unsupported input type')

    if overwrite:
        r_gpu = a_gpu
    else:
        r_gpu = a_gpu.copy()

    n = a_gpu.shape[0]

    # With increment n+1 on the destination, AXPY touches exactly the
    # diagonal entries of the (n, n) matrix:
    axpy(handle, n, 1.0, d_gpu.gpudata, int(1), r_gpu.gpudata, int(n+1))
    return r_gpu

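# Editorial sketch (not part of the original module): add_diag carries no
# doctest; a usage example in the same style as the other examples in this
# module. It assumes pycuda.autoinit has been imported and linalg.init()
# has been run, so that the global CUBLAS handle exists:

def _example_add_diag():
    """Add a vector to the diagonal of a square matrix on the GPU."""
    d = np.asarray(np.random.rand(4), np.float32)
    a = np.asarray(np.random.rand(4, 4), np.float32)
    d_gpu = gpuarray.to_gpu(d)
    a_gpu = gpuarray.to_gpu(a)
    r_gpu = add_diag(d_gpu, a_gpu, overwrite=False)
    assert np.allclose(r_gpu.get(), a + np.diag(d))
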
def _transpose(a_gpu, conj=False, handle=None):
    # Transpose (or conjugate-transpose) a matrix via the CUBLAS geam
    # routine, which computes C = alpha*op(A) + beta*B; with beta = 0,
    # C is simply op(A):
    if handle is None:
        handle = misc._global_cublas_handle

    if len(a_gpu.shape) != 2:
        raise ValueError('a_gpu must be a matrix')

    if (a_gpu.dtype == np.complex64):
        func = cublas.cublasCgeam
    elif (a_gpu.dtype == np.float32):
        func = cublas.cublasSgeam
    elif (a_gpu.dtype == np.complex128):
        func = cublas.cublasZgeam
    elif (a_gpu.dtype == np.float64):
        func = cublas.cublasDgeam
    else:
        raise ValueError('unsupported input type')

    if conj:
        transa = 'c'
    else:
        transa = 't'
    M, N = a_gpu.shape
    at_gpu = gpuarray.empty((N, M), a_gpu.dtype)
    func(handle, transa, 'n', M, N,
         1.0, a_gpu.gpudata, N, 0.0, a_gpu.gpudata, N,
         at_gpu.gpudata, M)
    return at_gpu

def transpose(a_gpu, handle=None):
    """
    Matrix transpose.

    Transpose a matrix in device memory and return an object
    representing the transposed matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `scikits.cuda.misc._global_cublas_handle` is used.

    Returns
    -------
    at_gpu : pycuda.gpuarray.GPUArray
        Transposed matrix of shape `(n, m)`.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> at_gpu = linalg.transpose(a_gpu)
    >>> np.all(a.T == at_gpu.get())
    True
    >>> b = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> bt_gpu = linalg.transpose(b_gpu)
    >>> np.all(b.T == bt_gpu.get())
    True
    """

    # Forward the handle so that a caller-specified CUBLAS context is
    # honored:
    return _transpose(a_gpu, False, handle)

def hermitian(a_gpu, handle=None):
    """
    Hermitian (conjugate) matrix transpose.

    Conjugate transpose a matrix in device memory and return an object
    representing the transposed matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `scikits.cuda.misc._global_cublas_handle` is used.

    Returns
    -------
    at_gpu : pycuda.gpuarray.GPUArray
        Transposed matrix of shape `(n, m)`.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> at_gpu = linalg.hermitian(a_gpu)
    >>> np.all(a.T == at_gpu.get())
    True
    >>> b = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> bt_gpu = linalg.hermitian(b_gpu)
    >>> np.all(np.conj(b.T) == bt_gpu.get())
    True
    """

    # Forward the handle so that a caller-specified CUBLAS context is
    # honored:
    return _transpose(a_gpu, True, handle)

def conj(x_gpu, overwrite=True):
    """
    Complex conjugate.

    Compute the complex conjugate of the array in device memory.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array of shape `(m, n)`.
    overwrite : bool
        If true (default), save the result in the specified array.
        If false, return the result in a newly allocated array.

    Returns
    -------
    xc_gpu : pycuda.gpuarray.GPUArray
        Conjugate of the input array. If `overwrite` is true, the
        returned matrix is the same as the input array.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> x = np.array([[1+1j, 2-2j, 3+3j, 4-4j], [5+5j, 6-6j, 7+7j, 8-8j]], np.complex64)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = linalg.conj(x_gpu)
    >>> np.all(x == np.conj(y_gpu.get()))
    True
    """

    # Don't attempt to process non-complex matrix types:
    if x_gpu.dtype in [np.float32, np.float64]:
        return x_gpu

    try:
        func = conj.cache[x_gpu.dtype]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        func = el.ElementwiseKernel(
                "{ctype} *x, {ctype} *y".format(ctype=ctype),
                "y[i] = conj(x[i])")
        conj.cache[x_gpu.dtype] = func
    if overwrite:
        func(x_gpu, x_gpu)
        return x_gpu
    else:
        y_gpu = gpuarray.empty_like(x_gpu)
        func(x_gpu, y_gpu)
        return y_gpu

conj.cache = {}

diag_template = Template("""
#include <pycuda-complex.hpp>

#if ${use_double}
#if ${use_complex}
#define FLOAT pycuda::complex<double>
#else
#define FLOAT double
#endif
#else
#if ${use_complex}
#define FLOAT pycuda::complex<float>
#else
#define FLOAT float
#endif
#endif

// Assumes that d already contains zeros in all positions.
// N must contain the number of elements in v.
__global__ void diag(FLOAT *v, FLOAT *d, int N) {
    unsigned int idx = blockIdx.y*blockDim.x*gridDim.x+
                       blockIdx.x*blockDim.x+threadIdx.x;
    if (idx < N)
        d[idx*(N+1)] = v[idx];
}
""")

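# Editorial sketch (not part of the original module): the kernel above
# writes v[idx] to flat offset idx*(N+1). For a row-major N x N matrix,
# element (i, i) lives at flat index i*N + i = i*(N+1), so these offsets
# enumerate exactly the main diagonal. Host-side equivalent:

def _check_diag_flat_indexing():
    """Fill the main diagonal of a zeroed matrix via flat indices."""
    v = np.arange(1, 5, dtype=np.float32)
    N = v.size
    d = np.zeros((N, N), np.float32)
    d.ravel()[np.arange(N)*(N + 1)] = v   # same offsets the kernel uses
    assert np.allclose(d, np.diag(v))
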
def diag(v_gpu):
    """
    Construct a diagonal matrix if the input array is one-dimensional,
    or extract the diagonal entries of a two-dimensional array.

    If the input array is one-dimensional, constructs a matrix in
    device memory whose diagonal elements correspond to the elements
    in the specified array; all non-diagonal elements are set to 0.

    If the input array is two-dimensional, constructs an array in
    device memory whose elements correspond to the elements along the
    main diagonal of the specified array.

    Parameters
    ----------
    v_gpu : pycuda.gpuarray.GPUArray
        Input array of shape `(n, m)`.

    Returns
    -------
    d_gpu : pycuda.gpuarray.GPUArray
        If `v_gpu` has shape `(n,)`, output is a diagonal matrix of
        dimensions `[n, n]`. If `v_gpu` has shape `(n, m)`, output is
        an array of length `min(n, m)`.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> v = np.array([1, 2, 3, 4, 5, 6], np.float32)
    >>> v_gpu = gpuarray.to_gpu(v)
    >>> d_gpu = linalg.diag(v_gpu)
    >>> np.all(d_gpu.get() == np.diag(v))
    True
    >>> v = np.array([1j, 2j, 3j, 4j, 5j, 6j], np.complex64)
    >>> v_gpu = gpuarray.to_gpu(v)
    >>> d_gpu = linalg.diag(v_gpu)
    >>> np.all(d_gpu.get() == np.diag(v))
    True
    >>> v = np.array([[1., 2., 3.], [4., 5., 6.]], np.float64)
    >>> v_gpu = gpuarray.to_gpu(v)
    >>> d_gpu = linalg.diag(v_gpu)
    >>> d_gpu
    array([ 1.,  5.])
    """

    if v_gpu.dtype not in [np.float32, np.float64, np.complex64,
                           np.complex128]:
        raise ValueError('unrecognized type')

    alloc = misc._global_cublas_allocator

    if (len(v_gpu.shape) > 1) and (len(v_gpu.shape) < 3):
        if (v_gpu.dtype == np.complex64):
            func = cublas.cublasCcopy
        elif (v_gpu.dtype == np.float32):
            func = cublas.cublasScopy
        elif (v_gpu.dtype == np.complex128):
            func = cublas.cublasZcopy
        elif (v_gpu.dtype == np.float64):
            func = cublas.cublasDcopy
        else:
            raise ValueError('unsupported input type')

        n = min(v_gpu.shape)
        m = max(v_gpu.shape)

        # Allocate the output array:
        d_gpu = gpuarray.empty(n, v_gpu.dtype.type, allocator=alloc)

        handle = misc._global_cublas_handle
        # Extract the diagonal via a strided BLAS copy:
        func(handle, n, v_gpu.gpudata, m+1, d_gpu.gpudata, 1)
        return d_gpu
    elif len(v_gpu.shape) >= 3:
        raise ValueError('input array cannot have greater than 2 dimensions')

    use_double = int(v_gpu.dtype in [np.float64, np.complex128])
    use_complex = int(v_gpu.dtype in [np.complex64, np.complex128])

    # Initialize output matrix:
    d_gpu = misc.zeros((v_gpu.size, v_gpu.size), v_gpu.dtype, allocator=alloc)

    # Get block/grid sizes:
    dev = misc.get_current_device()
    block_dim, grid_dim = misc.select_block_grid_sizes(dev, d_gpu.shape)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    diag_mod = \
        SourceModule(diag_template.substitute(use_double=use_double,
                                              use_complex=use_complex),
                     cache_dir=cache_dir)

    diag = diag_mod.get_function("diag")
    diag(v_gpu, d_gpu, np.uint32(v_gpu.size),
         block=block_dim, grid=grid_dim)

    return d_gpu

def eye(N, dtype=np.float32):
    """
    Construct a 2D matrix with ones on the diagonal and zeros elsewhere.

    Constructs a matrix in device memory whose diagonal elements
    are set to 1 and non-diagonal elements are set to 0.

    Parameters
    ----------
    N : int
        Number of rows or columns in the output matrix.
    dtype : type
        Matrix data type.

    Returns
    -------
    e_gpu : pycuda.gpuarray.GPUArray
        Diagonal matrix of dimensions `[N, N]` with diagonal values
        set to 1.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> N = 5
    >>> e_gpu = linalg.eye(N)
    >>> np.all(e_gpu.get() == np.eye(N))
    True
    >>> e_gpu = linalg.eye(N, np.complex64)
    >>> np.all(e_gpu.get() == np.eye(N, dtype=np.complex64))
    True
    """

    if dtype not in [np.float32, np.float64, np.complex64,
                     np.complex128]:
        raise ValueError('unrecognized type')
    if N <= 0:
        raise ValueError('N must be greater than 0')

    alloc = misc._global_cublas_allocator
    e_gpu = misc.zeros((N, N), dtype, allocator=alloc)
    func = el.ElementwiseKernel("{ctype} *e".format(ctype=tools.dtype_to_ctype(dtype)),
                                "e[i] = 1")
    func(e_gpu, slice=slice(0, N*N, N+1))
    return e_gpu

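# Editorial sketch (not part of the original module): eye() sets the
# diagonal by running the elementwise kernel only over the strided slice
# slice(0, N*N, N+1) of the flattened matrix, so just the N diagonal
# positions are written. NumPy analogue of the same slice:

def _check_eye_slice_trick():
    """Set ones on the diagonal of a zeroed matrix via a strided slice."""
    N = 5
    e = np.zeros((N, N), np.float32)
    e.ravel()[0:N*N:N + 1] = 1            # same slice the kernel iterates
    assert np.all(e == np.eye(N))
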
def pinv(a_gpu, rcond=1e-15):
    """
    Moore-Penrose pseudoinverse.

    Compute the Moore-Penrose pseudoinverse of the specified matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)`.
    rcond : float
        Singular values smaller than `rcond*max(singular_values)`
        are set to zero.

    Returns
    -------
    a_inv_gpu : pycuda.gpuarray.GPUArray
        Pseudoinverse of input matrix.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix.

    If the input matrix is square, the pseudoinverse uses less memory.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(8, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> a_inv_gpu = linalg.pinv(a_gpu)
    >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4)
    True
    >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64)
    >>> b_gpu = gpuarray.to_gpu(b)
    >>> b_inv_gpu = linalg.pinv(b_gpu)
    >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4)
    True
    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    # Perform in-place SVD if the matrix is square to save memory:
    if a_gpu.shape[0] == a_gpu.shape[1]:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o')
    else:
        u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's')

    # Suppress very small singular values and invert the rest:
    cutoff_gpu = gpuarray.max(s_gpu)*rcond
    ctype = tools.dtype_to_ctype(s_gpu.dtype)
    cutoff_func = el.ElementwiseKernel("{ctype} *s, {ctype} *cutoff".format(ctype=ctype),
        "if (s[i] > cutoff[0]) {s[i] = 1/s[i];} else {s[i] = 0;}")
    cutoff_func(s_gpu, cutoff_gpu)

    # Compute the pseudoinverse without allocating a new diagonal matrix:
    return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c')

tril_template = Template("""
#include <pycuda-complex.hpp>

#if ${use_double}
#if ${use_complex}
#define FLOAT pycuda::complex<double>
#else
#define FLOAT double
#endif
#else
#if ${use_complex}
#define FLOAT pycuda::complex<float>
#else
#define FLOAT float
#endif
#endif

__global__ void tril(FLOAT *a, unsigned int N) {
    unsigned int idx = blockIdx.y*blockDim.x*gridDim.x+
                       blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int ix = idx/${cols};
    unsigned int iy = idx%${cols};

    if (idx < N) {
        if (ix < iy)
            a[idx] = 0.0;
    }
}
""")

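# Editorial sketch (not part of the original module): the kernel above
# recovers 2D coordinates from the flat index of a row-major matrix as
# ix = idx/cols (row) and iy = idx%cols (column), and zeroes elements with
# ix < iy, i.e. the strict upper triangle. Host-side equivalent:

def _check_tril_indexing():
    """Zero the strict upper triangle using flat-index arithmetic."""
    a0 = np.random.rand(4, 4).astype(np.float32)
    a = a0.copy()
    cols = a.shape[1]
    flat = a.ravel()
    for idx in range(flat.size):
        ix, iy = idx // cols, idx % cols  # row and column of this element
        if ix < iy:                       # strictly above the diagonal
            flat[idx] = 0.0
    assert np.allclose(a, np.tril(a0))
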
def tril(a_gpu, overwrite=True, handle=None):
    """
    Lower triangle of a matrix.

    Return the lower triangle of a square matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, m)`.
    overwrite : bool
        If true (default), zero out the upper triangle of the matrix.
        If false, return the result in a newly allocated matrix.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `scikits.cuda.misc._global_cublas_handle` is used.

    Returns
    -------
    l_gpu : pycuda.gpuarray.GPUArray
        The lower triangle of the original matrix.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.asarray(np.random.rand(4, 4), np.float32)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> l_gpu = linalg.tril(a_gpu, False)
    >>> np.allclose(np.tril(a), l_gpu.get())
    True
    """

    if handle is None:
        handle = misc._global_cublas_handle

    alloc = misc._global_cublas_allocator

    if len(a_gpu.shape) != 2 or a_gpu.shape[0] != a_gpu.shape[1]:
        raise ValueError('matrix must be square')

    if a_gpu.dtype == np.float32:
        swap_func = cublas.cublasSswap
        copy_func = cublas.cublasScopy
        use_double = 0
        use_complex = 0
    elif a_gpu.dtype == np.float64:
        swap_func = cublas.cublasDswap
        copy_func = cublas.cublasDcopy
        use_double = 1
        use_complex = 0
    elif a_gpu.dtype == np.complex64:
        swap_func = cublas.cublasCswap
        copy_func = cublas.cublasCcopy
        use_double = 0
        use_complex = 1
    elif a_gpu.dtype == np.complex128:
        swap_func = cublas.cublasZswap
        copy_func = cublas.cublasZcopy
        use_double = 1
        use_complex = 1
    else:
        raise ValueError('unrecognized type')

    N = a_gpu.shape[0]

    # Get block/grid sizes:
    dev = misc.get_current_device()
    block_dim, grid_dim = misc.select_block_grid_sizes(dev, a_gpu.shape)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    tril_mod = \
        SourceModule(tril_template.substitute(use_double=use_double,
                                              use_complex=use_complex,
                                              cols=N),
                     cache_dir=cache_dir)
    tril = tril_mod.get_function("tril")

    if not overwrite:
        a_orig_gpu = gpuarray.empty(a_gpu.shape, a_gpu.dtype, allocator=alloc)
        copy_func(handle, a_gpu.size, int(a_gpu.gpudata), 1,
                  int(a_orig_gpu.gpudata), 1)

    tril(a_gpu, np.uint32(a_gpu.size),
         block=block_dim, grid=grid_dim)

    if overwrite:
        return a_gpu
    else:
        # Restore original contents of a_gpu:
        swap_func(handle, a_gpu.size, int(a_gpu.gpudata), 1,
                  int(a_orig_gpu.gpudata), 1)
        return a_orig_gpu

def multiply(x_gpu, y_gpu, overwrite=True):
    """
    Multiply arguments element-wise.

    Parameters
    ----------
    x_gpu, y_gpu : pycuda.gpuarray.GPUArray
        Input arrays to be multiplied.
    overwrite : bool
        If true (default), return the result in `y_gpu`.
        If false, return the result in a newly allocated array.

    Returns
    -------
    z_gpu : pycuda.gpuarray.GPUArray
        The element-wise product of the input arrays.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> x = np.asarray(np.random.rand(4, 4), np.float32)
    >>> y = np.asarray(np.random.rand(4, 4), np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = gpuarray.to_gpu(y)
    >>> z_gpu = linalg.multiply(x_gpu, y_gpu)
    >>> np.allclose(x*y, z_gpu.get())
    True
    """

    alloc = misc._global_cublas_allocator

    if x_gpu.shape != y_gpu.shape:
        raise ValueError('input arrays must have the same shape')

    if x_gpu.dtype not in [np.float32, np.float64, np.complex64,
                           np.complex128]:
        raise ValueError('unrecognized type')

    x_ctype = tools.dtype_to_ctype(x_gpu.dtype)
    y_ctype = tools.dtype_to_ctype(y_gpu.dtype)

    if overwrite:
        func = el.ElementwiseKernel("{x_ctype} *x, {y_ctype} *y".format(x_ctype=x_ctype,
                                                                        y_ctype=y_ctype),
                                    "y[i] *= x[i]")
        func(x_gpu, y_gpu)
        return y_gpu
    else:
        result_type = np.result_type(x_gpu.dtype, y_gpu.dtype)
        z_gpu = gpuarray.empty(x_gpu.shape, result_type, allocator=alloc)
        func = \
            el.ElementwiseKernel("{x_ctype} *x, {y_ctype} *y, {z_type} *z".format(x_ctype=x_ctype,
                                                                                  y_ctype=y_ctype,
                                                                                  z_type=tools.dtype_to_ctype(result_type)),
                                 "z[i] = x[i]*y[i]")
        func(x_gpu, y_gpu, z_gpu)
        return z_gpu

def norm(x_gpu, handle=None):
    """
    Euclidean norm (2-norm) of a vector.

    Computes the Euclidean norm of an array.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `scikits.cuda.misc._global_cublas_handle` is used.

    Returns
    -------
    nrm : real
        Euclidean norm of `x`.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> x = np.asarray(np.random.rand(4, 4), np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> nrm = linalg.norm(x_gpu)
    >>> np.allclose(nrm, np.linalg.norm(x))
    True
    >>> x_gpu = gpuarray.to_gpu(np.array([3+4j, 12-84j]))
    >>> linalg.norm(x_gpu)
    85.0
    """

    if handle is None:
        handle = misc._global_cublas_handle

    if len(x_gpu.shape) != 1:
        x_gpu = x_gpu.ravel()

    # Compute the norm via the appropriate CUBLAS NRM2 routine:
    if (x_gpu.dtype == np.complex64):
        cublas_func = cublas.cublasScnrm2
    elif (x_gpu.dtype == np.float32):
        cublas_func = cublas.cublasSnrm2
    elif (x_gpu.dtype == np.complex128):
        cublas_func = cublas.cublasDznrm2
    elif (x_gpu.dtype == np.float64):
        cublas_func = cublas.cublasDnrm2
    else:
        raise ValueError('unsupported input type')

    return cublas_func(handle, x_gpu.size, x_gpu.gpudata, 1)

def scale(alpha, x_gpu, alpha_real=False, handle=None):
    """
    Scale a vector by a factor alpha.

    Parameters
    ----------
    alpha : scalar
        Scale parameter.
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.
    alpha_real : bool
        If `True` and `x_gpu` is complex, then one of the specialized
        versions `cublasCsscal` or `cublasZdscal` is used which might
        improve performance for large arrays. (By default, `alpha` is
        coerced to the corresponding complex type.)
    handle : int
        CUBLAS context. If no context is specified, the default handle from
        `scikits.cuda.misc._global_cublas_handle` is used.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> x = np.asarray(np.random.rand(4, 4), np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> alpha = 2.4
    >>> linalg.scale(alpha, x_gpu)
    >>> np.allclose(x_gpu.get(), alpha*x)
    True
    """

    if handle is None:
        handle = misc._global_cublas_handle

    if len(x_gpu.shape) != 1:
        x_gpu = x_gpu.ravel()

    cublas_func = {
        np.float32: cublas.cublasSscal,
        np.float64: cublas.cublasDscal,
        np.complex64: cublas.cublasCsscal if alpha_real else
                      cublas.cublasCscal,
        np.complex128: cublas.cublasZdscal if alpha_real else
                       cublas.cublasZscal
    }.get(x_gpu.dtype.type, None)

    if cublas_func:
        return cublas_func(handle, x_gpu.size, alpha, x_gpu.gpudata, 1)
    else:
        raise ValueError('unsupported input type')

def inv(a_gpu, overwrite=False, ipiv_gpu=None):
    """
    Compute the inverse of a matrix.

    Parameters
    ----------
    a_gpu : pycuda.gpuarray.GPUArray
        Square (n, n) matrix to be inverted.
    overwrite : bool, optional
        Discard data in `a` (may improve performance). Default is False.
    ipiv_gpu : pycuda.gpuarray.GPUArray, optional
        Temporary array of size n, can be supplied to save allocations.

    Returns
    -------
    ainv_gpu : pycuda.gpuarray.GPUArray
        Inverse of the matrix `a`.

    Raises
    ------
    LinAlgError :
        If `a` is singular.
    ValueError :
        * If `a` is not square, or not 2-dimensional.
        * If ipiv was not None but had the wrong dtype or shape.
    """

    if not _has_cula:
        raise NotImplementedError('CULA not installed')

    alloc = misc._global_cublas_allocator

    if len(a_gpu.shape) != 2 or a_gpu.shape[0] != a_gpu.shape[1]:
        raise ValueError('expected square matrix')

    if (a_gpu.dtype == np.complex64):
        getrf = cula.culaDeviceCgetrf
        getri = cula.culaDeviceCgetri
    elif (a_gpu.dtype == np.float32):
        getrf = cula.culaDeviceSgetrf
        getri = cula.culaDeviceSgetri
    elif (a_gpu.dtype == np.complex128):
        getrf = cula.culaDeviceZgetrf
        getri = cula.culaDeviceZgetri
    elif (a_gpu.dtype == np.float64):
        getrf = cula.culaDeviceDgetrf
        getri = cula.culaDeviceDgetri
    else:
        raise ValueError('unsupported input type')

    n = a_gpu.shape[0]
    if ipiv_gpu is None:
        ipiv_gpu = gpuarray.empty((n, 1), a_gpu.dtype, allocator=alloc)
    elif ipiv_gpu.dtype != a_gpu.dtype or np.prod(ipiv_gpu.shape) < n:
        raise ValueError('invalid ipiv provided')

    out = a_gpu if overwrite else a_gpu.copy()
    try:
        # LU-factorize in place, then form the inverse from the factors:
        getrf(n, n, out.gpudata, n, ipiv_gpu.gpudata)
        getri(n, out.gpudata, n, ipiv_gpu.gpudata)
    except cula.culaDataError as e:
        raise LinAlgError(e)
    return out


if __name__ == "__main__":
    import doctest
    doctest.testmod()
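# Editorial sketch (appended for illustration, not part of the original
# module): inv() carries no doctest; a usage example in the style of the
# other examples here. It assumes CULA is installed and that
# pycuda.autoinit has been imported and linalg.init() has been run:

def _example_inv():
    """Invert a well-conditioned random matrix and check against NumPy."""
    a = np.asarray(np.random.rand(4, 4) + 4*np.eye(4), np.float32)
    a_gpu = gpuarray.to_gpu(a)
    ainv_gpu = inv(a_gpu)
    assert np.allclose(ainv_gpu.get(), np.linalg.inv(a), atol=1e-4)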