Commits

Kashif Rasul committed 3abb9a6

use cublas_v2 API

  • Participants
  • Parent commits d8287f0

Comments (0)

Files changed (2)

 #include <stdio.h>
 #include <stdlib.h>
-#include <cublas.h>
+#include <cublas_v2.h>
 #include "cudamat_kernels.cuh"
 #include "cudamat.cuh"
 
 
 /* ------------------------------ CUBLAS init/shutdown ------------------------------ */
 
-inline bool check_cublas_error() {
-    cublasStatus status = cublasGetError();
-
+inline bool check_cublas_error(cublasStatus_t status) {
     return status != CUBLAS_STATUS_SUCCESS;
 }
 
 }
 
 extern int cublas_init() {
-    cublasInit();
-    if (check_cublas_error())
+    cublasStatus_t status;
+    status = cublasCreate(&handle);
+    if (check_cublas_error(status))
         return CUBLAS_ERROR;
     else
         return 0;
 }
 
 extern int cublas_shutdown() {
-    cublasShutdown();
-    cudaThreadExit();
+    cublasDestroy(handle);
+    cudaDeviceReset();
 
     return 0;
 }
 }
 
 extern int init_random(rnd_struct* rnd_state, int seed, char* cudamatpath) {
+    cublasStatus_t status;
+    cudaError_t err = cudaSuccess;
     unsigned int * host_mults;
     host_mults = (unsigned int*)malloc(NUM_RND_STREAMS * sizeof(unsigned int));
     FILE * pFile;
     }
     fclose (pFile);
 
-    cublasAlloc(NUM_RND_STREAMS, sizeof(unsigned int), (void**)&rnd_state->dev_mults);
-    cublasAlloc(NUM_RND_STREAMS, sizeof(unsigned long long), (void**)&rnd_state->dev_words);
-    cublasSetVector(NUM_RND_STREAMS, sizeof(unsigned int), host_mults, 1, rnd_state->dev_mults, 1);
-    //cudaMalloc((void **)&rnd_state->dev_mults, NUM_RND_STREAMS * sizeof(unsigned int));
-    //cudaMalloc((void **)&rnd_state->dev_words, NUM_RND_STREAMS * sizeof(unsigned long long));
-    //cudaMemcpy(rnd_state->dev_mults, host_mults, NUM_RND_STREAMS * sizeof(unsigned int), cudaMemcpyHostToDevice);
+    err = cudaMalloc((void**)&rnd_state->dev_mults, NUM_RND_STREAMS*sizeof(unsigned int));
+    if (err != cudaSuccess)
+        return 0;
+    err = cudaMalloc((void**)&rnd_state->dev_words, NUM_RND_STREAMS*sizeof(unsigned long long));
+    if (err != cudaSuccess)
+        return 0;
+    status = cublasSetVector(NUM_RND_STREAMS, sizeof(unsigned int), host_mults, 1, rnd_state->dev_mults, 1);
+    if (check_cublas_error(status))
+        return 0;
     cudaThreadSynchronize();
 
     kSeedRandom<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK>>>(rnd_state->dev_mults, rnd_state->dev_words, seed);
- 
     cudaThreadSynchronize();
 
     if (checkCUDAError())
     mat->is_trans = is_trans;
 }
 
-inline char get_transpose_char(cudamat* mat) {
-    return mat->is_trans ? 't' : 'n';
+inline cublasOperation_t get_transpose_char(cudamat* mat) {
+    return mat->is_trans ? CUBLAS_OP_T : CUBLAS_OP_N;
 }
 
 extern void cuda_sync_threads() {
 extern int allocate_device_memory(cudamat* mat) {
     int len = mat->size[0]*mat->size[1];
 
-    cublasStatus stat;
+    cudaError_t err = cudaSuccess;
 
-    stat = cublasAlloc(len, sizeof(mat->data_device[0]), (void**)&mat->data_device);
+    err = cudaMalloc((void**)&mat->data_device, len*sizeof(mat->data_device[0]));
 
-    if (stat != CUBLAS_STATUS_SUCCESS || check_cublas_error()) {
-        checkCUDAError();
-        return CUBLAS_ERROR;
+    if (err != cudaSuccess) {
+        return err;
     }
 
     mat->on_device = 1;
 }
 
 extern int copy_to_host(cudamat* mat) {
+    cublasStatus_t status;
     int len = mat->size[0]*mat->size[1];
 
     if (mat->on_device) {
-            cublasGetVector(len, sizeof(mat->data_host[0]), mat->data_device, 1, mat->data_host, 1);
+            status = cublasGetVector(len, sizeof(mat->data_host[0]), mat->data_device, 1, mat->data_host, 1);
 
-        if (check_cublas_error())
+        if (check_cublas_error(status))
             return CUBLAS_ERROR;
     } else
        return ERROR_NOT_ON_DEVICE;
 }
 
 extern int copy_to_device(cudamat* mat) {
+    cublasStatus_t status;
     int len = mat->size[0]*mat->size[1];
     int err_code = 0;
 
             return err_code;
     }
 
-    cublasSetVector(len, sizeof(mat->data_host[0]), mat->data_host, 1, mat->data_device, 1);
+    status = cublasSetVector(len, sizeof(mat->data_host[0]), mat->data_host, 1, mat->data_device, 1);
     
-    if (check_cublas_error())
+    if (check_cublas_error(status))
         return CUBLAS_ERROR;
 
     return 0;
 }
 
 extern int copy_on_device(cudamat* mat1, cudamat* mat2) {
+    cublasStatus_t status;
     int len = mat1->size[0]*mat1->size[1];
 
     if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1])
         return ERROR_INCOMPATIBLE_DIMENSIONS;
 
-    cublasScopy(len, mat1->data_device, 1, mat2->data_device, 1);
+    status = cublasScopy(handle, len, mat1->data_device, 1, mat2->data_device, 1);
 
-    if (check_cublas_error())
+    if (check_cublas_error(status))
         return CUBLAS_ERROR;
     else
         return 0;
 
 extern int free_device_memory(cudamat* mat) {
     if (mat->owns_data && mat->on_device) {
-        cublasStatus stat;
+        cudaError_t err = cudaSuccess;
 
-        stat = cublasFree(mat->data_device);
+        err = cudaFree(mat->data_device);
         mat->on_device = 0;
 
-        if (stat != CUBLAS_STATUS_SUCCESS || check_cublas_error())
-            return CUBLAS_ERROR;
+        if (err != cudaSuccess)
+            return CUDA_ERROR;
     }
 
     return 0;
 }
 
 extern int dot(cudamat* mat1, cudamat* mat2, cudamat* target, float beta, float alpha) {
+    cublasStatus_t status;
     if (!mat1->on_device || !mat2->on_device || !target->on_device)
         return ERROR_NOT_ON_DEVICE;
 
         k = get_leading_dimension(mat2),
         n = get_nonleading_dimension(mat2);
 
-    cublasSgemm(get_transpose_char(mat1), get_transpose_char(mat2), 
+    status = cublasSgemm(handle, get_transpose_char(mat1), get_transpose_char(mat2), 
                 m, n, k,
-                alpha, mat1->data_device, mat1->size[0],
+                &alpha, mat1->data_device, mat1->size[0],
                 mat2->data_device, mat2->size[0],
-                beta, target->data_device, target->size[0]);
+                &beta, target->data_device, target->size[0]);
 
-    if (check_cublas_error())
+    if (check_cublas_error(status))
         return CUBLAS_ERROR;
 
     if (SYNC_THREADS) 
 }
 
 extern float vdot(cudamat* mat1, cudamat* mat2, int* err_code) {
+    cublasStatus_t status;
     int len = mat1->size[0]*mat1->size[1];
     float res;
 
         return 0;
     }
 
-    res = cublasSdot(len, mat1->data_device, 1, mat2->data_device, 1);
+    status = cublasSdot(handle, len, mat1->data_device, 1, mat2->data_device, 1, &res);
 
-    if (check_cublas_error()) {
+    if (check_cublas_error(status)) {
         *err_code = CUBLAS_ERROR;
         return -1.;
     } else {
 /* Perform the operation mat1 = mat1 + alpha * mat2. mat1 and mat2 must
    have the same transposedness. */
 extern int add_mult(cudamat* mat1, cudamat* mat2, float alpha) {
+    cublasStatus_t status;
     int len = mat1->size[0]*mat1->size[1];
 
     if (!mat1->on_device || !mat2->on_device)
     if (mat1->size[0] != mat2->size[0] || mat1->size[1] != mat2->size[1])
         return ERROR_INCOMPATIBLE_DIMENSIONS;
 
-    cublasSaxpy(len, alpha, mat2->data_device, 1, mat1->data_device, 1);
+    status = cublasSaxpy(handle, len, &alpha, mat2->data_device, 1, mat1->data_device, 1);
 
-    if (check_cublas_error())
+    if (check_cublas_error(status))
         return CUBLAS_ERROR;
 
     return 0;
 }
 
 extern int add_elementwise(cudamat* mat1, cudamat* mat2, cudamat* target) {
+    cublasStatus_t status;
     int len = mat1->size[0]*mat1->size[1];
 
     if (!mat1->on_device || !mat2->on_device || !target->on_device)
         return ERROR_INCOMPATIBLE_DIMENSIONS;
 
     if (mat1 == target) {
-        cublasSaxpy(len, 1, mat2->data_device, 1, mat1->data_device, 1);
+        const float floatone = 1.0;
+        status = cublasSaxpy(handle, len, &floatone, mat2->data_device, 1, mat1->data_device, 1);
  
-        if (check_cublas_error())
+        if (check_cublas_error(status))
             return CUBLAS_ERROR;
 
     } else {
 }
 
 extern int mult_by_scalar(cudamat* mat, float alpha, cudamat* target) {
+    cublasStatus_t status;
     int len = mat->size[0]*mat->size[1];
 
     if (!mat->on_device || !target->on_device)
         return ERROR_INCOMPATIBLE_DIMENSIONS;
 
     if (mat == target) {
-        cublasSscal(len, alpha, mat->data_device, 1);
+        status = cublasSscal(handle, len, &alpha, mat->data_device, 1);
  
-        if (check_cublas_error())
+        if (check_cublas_error(status))
             return CUBLAS_ERROR;
 
     } else {
 }
 
 extern float euclid_norm(cudamat* mat, int* err_code) {
+    cublasStatus_t status;
     int len = mat->size[0]*mat->size[1];
 
-    float res =  cublasSnrm2(len, mat->data_device, 1);
+    float res;
+    status = cublasSnrm2(handle, len, mat->data_device, 1, &res);
 
     if (!mat->on_device)
         return ERROR_NOT_ON_DEVICE;
 
-    if (check_cublas_error()) {
+    if (check_cublas_error(status)) {
         *err_code = CUBLAS_ERROR;
         return -1.;
     } else {
 #define ERROR_NOT_ON_DEVICE -8
 #define ERROR_UNSUPPORTED -9
 
+cublasHandle_t handle;
+
 struct cudamat {
     float* data_host;
     float* data_device;