Commits

Kashif Rasul committed afd46d6

whitespace

  • Parent commits b37add0

Files changed (4)
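
cudamat.cu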

 
 extern int cuda_set_device(int deviceId) {
     cudaSetDevice(deviceId);
-    
+
     if (checkCUDAError())
         return CUDA_ERROR;
     else
         return 0;
 }
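
For context, the checkCUDAError helper used throughout this file is not touched by this diff; it presumably wraps cudaGetLastError along these lines (a minimal sketch, not the literal cudamat source):

    // Hedged sketch: returns nonzero if the most recent CUDA call failed.
    int checkCUDAError() {
        cudaError_t err = cudaGetLastError();
        return err != cudaSuccess;
    }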
 
-extern int init_random(unsigned long long seed, curandRngType_t rngType) {    
+extern int init_random(unsigned long long seed, curandRngType_t rngType) {
     curandStatus_t status;
 
     status = curandCreateGenerator(&gen, rngType);
             return CUBLAS_ERROR;
     } else
        return ERROR_NOT_ON_DEVICE;
- 
+
     return 0;
 }
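
The cuRAND setup here is standard: create a generator of the requested type, then (presumably, given the seed parameter above) seed it. A minimal sketch with illustrative values:

    /* Hedged sketch of the generator setup init_random performs. */
    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);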
 
     }
 
     status = cublasSetVector(len, sizeof(mat->data_host[0]), mat->data_host, 1, mat->data_device, 1);
-    
+
     if (check_cublas_error(status))
         return CUBLAS_ERROR;
 
         k = get_leading_dimension(mat2),
         n = get_nonleading_dimension(mat2);
 
-    status = cublasSgemm(handle, get_transpose_char(mat1), get_transpose_char(mat2), 
+    status = cublasSgemm(handle, get_transpose_char(mat1), get_transpose_char(mat2),
                 m, n, k,
                 &alpha, mat1->data_device, mat1->size[0],
                 mat2->data_device, mat2->size[0],
     if (check_cublas_error(status))
         return CUBLAS_ERROR;
 
-    if (SYNC_THREADS) 
+    if (SYNC_THREADS)
         cudaThreadSynchronize();
 
     return 0;
         return 0;
     }
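
The GEMM call above follows the cuBLAS v2 convention C = alpha * op(A) * op(B) + beta * C, with m, n, k taken from the leading and non-leading dimensions of the operands. A minimal sketch of that convention (handle, device pointers, and leading dimensions assumed set up elsewhere):

    /* Hedged sketch: C <- alpha * op(A) * op(B) + beta * C. */
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                m, n, k,
                &alpha, d_A, lda,
                d_B, ldb,
                &beta, d_C, ldc);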
 
-    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1]) { 
+    if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1]) {
         *err_code = ERROR_INCOMPATIBLE_DIMENSIONS;
         return 0;
     }
     if (mat1 == target) {
         const float floatone = 1.0;
         status = cublasSaxpy(handle, len, &floatone, mat2->data_device, 1, mat1->data_device, 1);
- 
+
         if (check_cublas_error(status))
             return CUBLAS_ERROR;
 
     } else {
         kAdd<<<NUM_VECTOR_OP_BLOCKS,NUM_VECTOR_OP_THREADS_PER_BLOCK>>>(mat1->data_device, mat2->data_device, target->data_device, len);
- 
+
         if (SYNC_THREADS)
             cudaThreadSynchronize();
 
         if (checkCUDAError())
             return CUDA_ERROR;
     }
- 
+
      return 0;
 }
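
The in-place branch leans on cuBLAS: cublasSaxpy computes y <- alpha * x + y, so with alpha = 1 it adds mat2 into mat1 directly, while the general case falls back to the kAdd kernel. A minimal sketch of the SAXPY pattern (handle, d_x, d_y, and len assumed set up elsewhere):

    /* Hedged sketch: in-place elementwise add, y <- 1.0f * x + y. */
    const float one = 1.0f;
    cublasSaxpy(handle, len, &one, d_x, 1, d_y, 1);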
 
 
     if (mat == target) {
         status = cublasSscal(handle, len, &alpha, mat->data_device, 1);
- 
+
         if (check_cublas_error(status))
             return CUBLAS_ERROR;
 
     } else {
         kMultScalar<<<NUM_VECTOR_OP_BLOCKS,NUM_VECTOR_OP_THREADS_PER_BLOCK>>>(mat->data_device, alpha, target->data_device, len);
 
-        if (SYNC_THREADS) 
+        if (SYNC_THREADS)
             cudaThreadSynchronize();
 
         if (checkCUDAError())
             return CUDA_ERROR;
     }
- 
+
     return 0;
 }
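
The scalar multiply is analogous: the in-place case maps to cublasSscal, which scales a vector as x <- alpha * x, and the general case launches the kMultScalar kernel.

    /* Hedged sketch: in-place scale, x <- alpha * x. */
    cublasSscal(handle, len, &alpha, d_x, 1);

cudamat.py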
 
     elif err_code == -8:
         return CUDAMatException("Matrix is not in device memory.")
     elif err_code == -9:
-        return CUDAMatException("Operation not supported.")        
+        return CUDAMatException("Operation not supported.")
 
 class cudamat(ct.Structure):
     _fields_ = [('data_host', ct.POINTER(ct.c_float)),
             err_code = _cudamat.assign_scalar(self.p_mat, ct.c_float(val))
         else:
             raise ValueError, "Assigned value must be of type CUDAMatrix, int, or float."
-            
+
         if err_code:
             raise generate_exception(err_code)
 
         distribution over the (0,1] interval.
         """
 
-        err_code = _cudamat.fill_with_rand(self.p_mat) 
+        err_code = _cudamat.fill_with_rand(self.p_mat)
         if err_code:
             raise generate_exception(err_code)
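
A minimal usage sketch for the random-fill path above, assuming the library and device have already been initialized (shapes and seed are illustrative):

    import numpy as np
    import cudamat as cm

    cm.CUDAMatrix.init_random(42)   # seed the random number generator
    m = cm.CUDAMatrix(np.zeros((128, 64), dtype=np.float32, order='F'))
    m.fill_with_rand()              # uniform samples on the (0, 1] interval
    m.copy_to_host()                # results land back in host memory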
 
             raise generate_exception(err_code)
 
         return target
-        
+
     def add_col_mult(self, vec, mult, target = None):
         """
         Add a multiple of vector vec to every column of the matrix. If a target
             raise generate_exception(err_code)
 
         return target
-        
+
     def add_row_vec(self, vec, target = None):
         """
         Add vector vec to every row of the matrix. If a target is provided,
             raise generate_exception(err_code)
 
         return target
-        
+
     def mult_by_col(self, vec, target = None):
         """
         Multiply vector vec into every column of the matrix. If a target is
             raise generate_exception(err_code)
 
         return target
-        
+
     def mult_by_row(self, vec, target = None):
         """
         Multiply vector vec into every row of the matrix. If a target is
             raise generate_exception(err_code)
 
         return target
-        
+
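
A usage sketch tying together the broadcast helpers above (shapes illustrative; arrays must be float32 and Fortran-ordered, as in the tests below):

    a   = cm.CUDAMatrix(np.asfortranarray(np.random.rand(4, 3).astype(np.float32)))
    col = cm.CUDAMatrix(np.asfortranarray(np.random.rand(4, 1).astype(np.float32)))
    row = cm.CUDAMatrix(np.asfortranarray(np.random.rand(1, 3).astype(np.float32)))

    a.add_col_mult(col, 2.0)   # a += 2.0 * col, broadcast across columns
    a.add_row_vec(row)         # a += row, broadcast across rows
    a.mult_by_col(col)         # a *= col, broadcast across columns
    a.mult_by_row(row)         # a *= row, broadcast across rows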
     def sum(self, axis, target = None):
         """
         Sum the matrix along the given dimension, where 0 represents the leading
     def add_sums(self, mat, axis, mult = 1.):
         """
         Add a multiple of the sums of the matrix mat along the given dimension
-        to self. 
+        to self.
         """
 
         m = _cudamat.get_leading_dimension(mat.p_mat)
             left = CUDAMatrix.ones.slice(0, m)
             left.set_trans(True)
             right = mat
- 
+
         elif axis == 1:
             # sum along non-leading dimension
             left = mat
         if axis == 0:
             if not target:
                 target = empty((1, n))
- 
+
         elif axis == 1:
             if not target:
                 target = empty((m, 1))
             raise generate_exception(err_code)
 
         return self
-    
+
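
A sketch of the reduction methods above, using the stated convention that axis 0 is the leading dimension:

    s_cols = a.sum(axis=0)             # 1 x n matrix of column sums
    s_rows = a.sum(axis=1)             # m x 1 matrix of row sums

    acc = cm.empty((1, 3))             # hypothetical preallocated target
    acc.assign(0)
    acc.add_sums(a, axis=0, mult=0.5)  # acc += 0.5 * column sums of a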
     def subtract_mult(self, mat2, alpha = 1.):
         """
         Subtract a multiple of mat2 from the matrix.
 
         if not target:
             target = empty((1, n))
- 
+
     elif axis == 1:
         # sum along non-leading dimension
         left = mat

cudamat_kernels.cu

     __shared__ float max_vals[32];
     float cur_max = -FLT_MAX;
     float val = 0;
- 
+
     for (unsigned int i = threadIdx.x; i < height; i += 32) {
         val = mat[blockIdx.x * height + i];
 
 __global__ void kApplyAbs(float* mat, float* target, unsigned int len) {
     const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int numThreads = blockDim.x * gridDim.x;
-    
+
     for (unsigned int i = idx; i < len; i += numThreads) {
         target[i] = mat[i] * ((mat[i] > 0) - (mat[i] < 0));
     }
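
The expression (mat[i] > 0) - (mat[i] < 0) evaluates to the sign of mat[i] (-1, 0, or +1), so the product computes the absolute value without divergent branching.

test_cudamat.py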
     n = 128
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     b = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
-    
+
     m1 = cm.CUDAMatrix(a)
     m2 = cm.CUDAMatrix(b)
 
     m = 256
     n = 128
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
-    
+
     m1 = cm.CUDAMatrix(a)
 
     m1.assign(np.pi)
 
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     b = np.array(np.random.rand(end-start, n)*10, dtype=np.float32, order='F')
-    
+
     c = np.array(a[start:end,:], order='F')
-    
+
     m1 = cm.CUDAMatrix(a)
     m2 = cm.CUDAMatrix(b)
     m1.get_row_slice(start, end, target = m2)
 
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     b = np.array(np.random.rand(end-start, n)*10, dtype=np.float32, order='F')
-    
+
     c = a.copy()
     c[start:end,:] = b
-    
+
     m1 = cm.CUDAMatrix(a)
     m2 = cm.CUDAMatrix(b)
     m1.set_row_slice(start, end, m2)
 
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     b = np.array(np.random.rand(n, m), dtype=np.float32, order='F')
-    
+
     c = a.copy().T
-    
+
     m = cm.CUDAMatrix(a)
     mt1 = cm.CUDAMatrix(b)
     m.transpose(target = mt1)
     m = 256
     n = 128
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
-    
+
     c = np.array(a[:,32:64], order='F')
-    
+
     m1 = cm.CUDAMatrix(a)
     m2 = m1.slice(32, 64)
     m2.copy_to_host()
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     b = np.array(np.random.rand(m, 1)*10, dtype=np.float32, order='F')
     t = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
-    
+
     c = a + b
-    
+
     m1 = cm.CUDAMatrix(a)
     m2 = cm.CUDAMatrix(b)
     m3 = cm.CUDAMatrix(t)
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     b = np.array(np.random.rand(m, 1)*10, dtype=np.float32, order='F')
     t = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
-    
+
     c = a + mult * b
-    
+
     m1 = cm.CUDAMatrix(a)
     m2 = cm.CUDAMatrix(b)
     m3 = cm.CUDAMatrix(t)
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     b = np.array(np.random.rand(1, n)*10, dtype=np.float32, order='F')
     t = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
-    
+
     c = a + b
-    
+
     m1 = cm.CUDAMatrix(a)
     m2 = cm.CUDAMatrix(b)
     m3 = cm.CUDAMatrix(t)
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     b = np.array(np.random.rand(m, 1)*10, dtype=np.float32, order='F')
     t = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
-    
+
     c = a * b
-    
+
     m1 = cm.CUDAMatrix(a)
     m2 = cm.CUDAMatrix(b)
     m3 = cm.CUDAMatrix(t)
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     b = np.array(np.random.rand(1, n)*10, dtype=np.float32, order='F')
     t = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
-    
+
     c = a * b
-    
+
     m1 = cm.CUDAMatrix(a)
     m2 = cm.CUDAMatrix(b)
     m3 = cm.CUDAMatrix(t)
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     t1 = np.array(np.random.rand(1, n)*10, dtype=np.float32, order='F')
     t2 = np.array(np.random.rand(m, 1)*10, dtype=np.float32, order='F')
-    
+
     c1 = np.atleast_2d(a.sum(0))
     c2 = np.atleast_2d(a.sum(1)).T
-    
+
     m = cm.CUDAMatrix(a)
     mt1 = cm.CUDAMatrix(t1)
     mt2 = cm.CUDAMatrix(t2)
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
     t1 = np.array(np.random.rand(1, m)*10, dtype=np.float32, order='F')
     t2 = np.array(np.random.rand(n, 1)*10, dtype=np.float32, order='F')
-    
+
     c1 = np.atleast_2d(a.T.sum(0))
     c2 = np.atleast_2d(a.T.sum(1)).T
-    
+
     m = cm.CUDAMatrix(a)
     m.set_trans(True)
     mt1 = cm.CUDAMatrix(t1)
     t2 = np.array(np.random.rand(1, n)*10, dtype=np.float32, order='F')
 
     mult = np.pi
-    
+
     c1 = t1 + mult * np.atleast_2d(a.sum(1)).T
     c2 = t2 + np.atleast_2d(a.sum(0))
-    
+
     m = cm.CUDAMatrix(a)
     mt1 = cm.CUDAMatrix(t1)
     mt2 = cm.CUDAMatrix(t2)
     n = 128
     a = np.array(np.random.randn(m, n)*10, dtype=np.float32, order='F')
     t = np.array(np.random.rand(1, n)*10, dtype=np.float32, order='F')
-   
-    r = np.atleast_2d(a.max(0)) 
-    
+
+    r = np.atleast_2d(a.max(0))
+
     da = cm.CUDAMatrix(a)
     dr1 = cm.CUDAMatrix(t)
 
     n = 128
     a = np.array(-np.random.rand(m, n)*10, dtype=np.float32, order='F')
     t = np.array(np.random.rand(1, n)*10, dtype=np.float32, order='F')
-   
-    r = np.atleast_2d(a.max(0)) 
-    
+
+    r = np.atleast_2d(a.max(0))
+
     da = cm.CUDAMatrix(a)
     dr1 = cm.CUDAMatrix(t)
 
 
 def test_random():
     cm.CUDAMatrix.init_random(1)
-    m1 = cm.CUDAMatrix(np.array(np.empty((128,256)), dtype=np.float32, order='F'))
-    m2 = cm.CUDAMatrix(np.array(np.empty((128,256)), dtype=np.float32, order='F'))
+    m1 = cm.CUDAMatrix(np.array(np.empty((256,256)), dtype=np.float32, order='F'))
+    m2 = cm.CUDAMatrix(np.array(np.empty((256,256)), dtype=np.float32, order='F'))
 
     m1.fill_with_rand()
     m1.copy_to_host()
     m = 256
     n = 128
     a = np.array(np.random.rand(m, n)*10, dtype=np.float32, order='F')
-    
+
     m = cm.CUDAMatrix(a)
 
     n1 = np.sqrt(np.sum(a**2))
     i_l = [0, 1, 2, 3, 5, 10, 12, 20]
     i = np.array(i_l).T[np.newaxis, :]
     t = np.empty((m, k))
-    
+
     s_d = cm.CUDAMatrix(s)
     i_d = cm.CUDAMatrix(i)
     t_d = cm.CUDAMatrix(t)