Commits

Karl Rupp committed d67ff14

ViennaCL: Added sequential AIJ matrix.

Compiles, 'make test' passes. Further testing required.
Injects code guarded by PETSC_HAVE_VIENNACL into existing functions, analogous to the existing PETSC_HAVE_CUSP blocks.
A general concept for dealing with accelerators is needed here, but is postponed until after release 3.4.
aijAssemble.cpp is a stub; row-compressed CSR is not yet supported either.
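
For orientation, a minimal sketch of how the new type is meant to be picked up by an application (not part of this commit; it assumes a ViennaCL-enabled PETSc build and abbreviates setup and error handling):

    /* Sketch: select the new matrix type from the options database,
       e.g. by running the application with  -mat_type aijviennacl  */
    Mat            A;
    Vec            x, y;
    PetscErrorCode ierr;
    PetscInt       n = 100;

    ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
    ierr = MatSetSizes(A,n,n,n,n);CHKERRQ(ierr);
    ierr = MatSetFromOptions(A);CHKERRQ(ierr);                   /* honours -mat_type aijviennacl */
    ierr = MatSetUp(A);CHKERRQ(ierr);
    /* ... MatSetValues() as usual ... */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);   /* triggers MatViennaCLCopyToGPU() */

    ierr = MatGetVecs(A,&x,&y);CHKERRQ(ierr);                    /* returns VECSEQVIENNACL vectors */
    ierr = MatMult(A,x,y);CHKERRQ(ierr);                         /* executed through viennacl::linalg::prod() */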


Files changed (10)

include/petsc-private/matimpl.h

 #if defined(PETSC_HAVE_CUSP)
   PetscCUSPFlag          valid_GPU_matrix; /* flag pointing to the matrix on the gpu*/
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  PetscViennaCLFlag          valid_GPU_matrix; /* flag pointing to the matrix on the gpu*/
+#endif
   void                   *spptr;          /* pointer for special library like SuperLU */
   MatSolverPackage       solvertype;
   PetscViewer            viewonassembly;         /* the following are set in MatSetFromOptions() and used in MatAssemblyEnd() */
 PETSC_EXTERN PetscLogEvent MATMFFD_Mult;
 PETSC_EXTERN PetscLogEvent MAT_GetMultiProcBlock;
 PETSC_EXTERN PetscLogEvent MAT_CUSPCopyToGPU, MAT_CUSPARSECopyToGPU, MAT_SetValuesBatch, MAT_SetValuesBatchI, MAT_SetValuesBatchII, MAT_SetValuesBatchIII, MAT_SetValuesBatchIV;
+PETSC_EXTERN PetscLogEvent MAT_ViennaCLCopyToGPU;
 PETSC_EXTERN PetscLogEvent MAT_Merge;
 
 #endif
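
The PetscViennaCLFlag type used above is not part of this hunk; it presumably mirrors PetscCUSPFlag and is declared elsewhere in the tree, roughly along these lines (a sketch of the assumed shape, not the actual declaration):

    /* Assumed by analogy with PetscCUSPFlag; only UNALLOCATED, CPU and BOTH
       appear in this commit's hunks. */
    typedef enum {PETSC_VIENNACL_UNALLOCATED, /* no GPU copy exists yet            */
                  PETSC_VIENNACL_GPU,         /* GPU copy is current, CPU is stale */
                  PETSC_VIENNACL_CPU,         /* CPU copy is current, GPU is stale */
                  PETSC_VIENNACL_BOTH         /* CPU and GPU copies are in sync    */
                 } PetscViennaCLFlag;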

include/petscmat.h

 #define MATAIJCUSPARSE     "aijcusparse"
 #define MATSEQAIJCUSPARSE  "seqaijcusparse"
 #define MATMPIAIJCUSPARSE  "mpiaijcusparse"
+#define MATAIJVIENNACL     "aijviennacl"
+#define MATSEQAIJVIENNACL  "seqaijviennacl"
+#define MATMPIAIJVIENNACL  "mpiaijviennacl"
 #define MATAIJPERM         "aijperm"
 #define MATSEQAIJPERM      "seqaijperm"
 #define MATMPIAIJPERM      "mpiaijperm"
 PETSC_EXTERN PetscErrorCode MatCUSPSetFormat(Mat,MatCUSPFormatOperation,MatCUSPStorageFormat);
 #endif
 
+#if defined(PETSC_HAVE_VIENNACL)
+PETSC_EXTERN PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm,PetscInt,PetscInt,PetscInt,const PetscInt[],Mat*);
+PETSC_EXTERN PetscErrorCode MatCreateAIJViennaCL(MPI_Comm,PetscInt,PetscInt,PetscInt,PetscInt,PetscInt,const PetscInt[],PetscInt,const PetscInt[],Mat*);
+#endif
+
 /*
    PETSc interface to FFTW
 */

src/mat/impls/aij/seq/makefile

 SOURCEF  =
 SOURCEH  = aij.h
 LIBBASE  = libpetscmat
-DIRS     = superlu umfpack essl lusol matlab csrperm crl bas ftn-kernels seqcusp \
+DIRS     = superlu umfpack essl lusol matlab csrperm crl bas ftn-kernels seqcusp seqviennacl \
            cholmod seqcusparse
 MANSEC   = Mat
 LOCDIR   = src/mat/impls/aij/seq/

src/mat/impls/aij/seq/seqviennacl/aijAssemble.cpp

+#include "petscconf.h"
+
+#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
+#include "petscbt.h"
+#include "../src/vec/vec/impls/dvecimpl.h"
+#include "petsc-private/vecimpl.h"
+
+
+#include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h"
+#include "viennacl/linalg/prod.hpp"
+
+
+
+// Ne: Number of elements
+// Nl: Number of dof per element
+#undef __FUNCT__
+#define __FUNCT__ "MatSetValuesBatch_SeqAIJViennaCL"
+PetscErrorCode MatSetValuesBatch_SeqAIJViennaCL(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats)
+{
+  //TODO
+  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Implementation of MatSetValuesBatch_SeqAIJViennaCL() missing.");
+}
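
One possible interim CPU fallback (not part of this commit; the function name is hypothetical) would be to scatter each Nl-by-Nl element matrix through MatSetValues() instead of raising an error:

    /* Hypothetical CPU fallback for batch assembly: insert the Ne element
       matrices one by one via MatSetValues() with ADD_VALUES. */
    static PetscErrorCode MatSetValuesBatch_SeqAIJViennaCL_CPUFallback(Mat J, PetscInt Ne, PetscInt Nl,
                                                                       PetscInt *elemRows, const PetscScalar *elemMats)
    {
      PetscErrorCode ierr;
      PetscInt       e;

      PetscFunctionBegin;
      for (e = 0; e < Ne; ++e) {
        ierr = MatSetValues(J, Nl, elemRows + e*Nl, Nl, elemRows + e*Nl,
                            elemMats + e*Nl*Nl, ADD_VALUES);CHKERRQ(ierr);
      }
      PetscFunctionReturn(0);
    }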

src/mat/impls/aij/seq/seqviennacl/aijviennacl.cpp

+
+
+/*
+    Defines the basic matrix operations for the AIJ (compressed row)
+  matrix storage format.
+*/
+
+#include "petscconf.h"
+#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
+#include "petscbt.h"
+#include "../src/vec/vec/impls/dvecimpl.h"
+#include "petsc-private/vecimpl.h"
+
+#include "../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h"
+
+
+#include <algorithm>
+#include <vector>
+#include <string>
+
+#include "viennacl/linalg/prod.hpp"
+
+#undef __FUNCT__
+#define __FUNCT__ "MatViennaCLCopyToGPU"
+PetscErrorCode MatViennaCLCopyToGPU(Mat A)
+{
+
+  Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
+  Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
+  //PetscInt           m               = A->rmap->n,*ii,*ridx;
+  PetscInt           *ii;
+  PetscErrorCode     ierr;
+
+
+  PetscFunctionBegin;
+  if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
+    ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
+
+    try {
+      //viennaclstruct->nonzerorow=0;
+      //for (PetscInt j = 0; j<m; j++) viennaclstruct->nonzerorow += (a->i[j+1] > a->i[j]);
+
+      viennaclstruct->mat = new ViennaCLAIJMatrix();
+      if (a->compressedrow.use) {
+        //m    = a->compressedrow.nrows;
+        ii   = a->compressedrow.i;
+        //ridx = a->compressedrow.rindex;
+        
+        viennaclstruct->mat->set(ii, a->j, a->a, A->rmap->n, A->cmap->n, a->nz);
+        
+        // TODO: Either convert to full CSR (inefficient), or hold row indices in temporary vector (requires additional bookkeeping for matrix-vector multiplications)
+        
+        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
+      } else {
+        
+        // Since PetscInt is in general different from cl_uint, we have to convert:
+        viennacl::backend::mem_handle dummy;
+        
+        viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
+        for (PetscInt i=0; i<=A->rmap->n; ++i)
+          row_buffer.set(i, (a->i)[i]);
+        
+        viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
+        for (PetscInt i=0; i<a->nz; ++i)
+          col_buffer.set(i, (a->j)[i]);
+        
+        viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
+      }
+    } catch(char *ex) {
+      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
+    }
+
+    A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
+
+    ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
+  }
+  PetscFunctionReturn(0);
+}
+
+#undef __FUNCT__
+#define __FUNCT__ "MatViennaCLCopyFromGPU"
+PetscErrorCode MatViennaCLCopyFromGPU(Mat A, ViennaCLAIJMatrix *Agpu)
+{
+  //Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
+  Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
+  PetscInt           m               = A->rmap->n;
+  PetscErrorCode     ierr;
+
+
+  PetscFunctionBegin;
+  if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
+    try {
+      if (a->compressedrow.use) {
+        SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
+      } else {
+
+        if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
+        a->nz           = Agpu->nnz();
+        a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
+        A->preallocated = PETSC_TRUE;
+        if (a->singlemalloc) {
+          if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
+        } else {
+          if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
+          if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
+          if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
+        }
+        ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
+        ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
+
+        a->singlemalloc = PETSC_TRUE;
+
+        /* Setup row lengths */
+        if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
+        ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
+        ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
+
+        /* Copy data back from GPU */
+        viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
+        
+        // copy row array
+        viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+        (a->i)[0] = row_buffer[0];
+        for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
+          (a->i)[i+1] = row_buffer[i+1];
+          a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
+        }
+        
+        // copy column indices
+        viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
+        viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+        for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
+          (a->j)[i] = col_buffer[i];
+        
+        // copy nonzero entries directly to destination (no conversion required)
+        viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
+        
+        /* TODO: What about a->diag? */
+      }
+    } catch(char *ex) {
+      SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex);
+    }
+
+    /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
+    ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
+    ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
+
+    A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
+  } else {
+    SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
+  }
+  PetscFunctionReturn(0);
+}
+
+#undef __FUNCT__
+#define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
+PetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
+{
+  PetscErrorCode ierr;
+
+  PetscFunctionBegin;
+  if (right) {
+    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
+    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
+    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
+    ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
+    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
+  }
+  if (left) {
+    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
+    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
+    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
+    ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
+    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
+  }
+  PetscFunctionReturn(0);
+}
+
+#undef __FUNCT__
+#define __FUNCT__ "MatMult_SeqAIJViennaCL"
+PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
+{
+  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
+  PetscErrorCode     ierr;
+  Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
+  ViennaCLVector     *xgpu=NULL,*ygpu=NULL;
+
+  PetscFunctionBegin;
+  ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
+  ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
+  try {
+    *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
+  } catch (char *ex) {
+    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
+  }
+  ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
+  ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
+  ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
+  PetscFunctionReturn(0);
+}
+
+
+
+#undef __FUNCT__
+#define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
+PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
+{
+  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
+  PetscErrorCode     ierr;
+  Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
+  ViennaCLVector     *xgpu=NULL,*ygpu=NULL,*zgpu=NULL;
+
+  PetscFunctionBegin;
+  try {
+    ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
+    ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
+    ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
+
+    if (a->compressedrow.use) {
+        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Compressed CSR (only nonzero rows) not yet supported");
+    } else {
+      if (zz == xx || zz == yy) { //temporary required
+        ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
+        *zgpu = *ygpu + temp;
+      } else {
+        *zgpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
+        *zgpu += *ygpu;
+      }
+    }
+
+    ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
+    ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
+    ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
+
+  } catch(char *ex) {
+    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
+  }
+  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
+  PetscFunctionReturn(0);
+}
+
+#undef __FUNCT__
+#define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
+PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
+{
+  PetscErrorCode ierr;
+
+  PetscFunctionBegin;
+  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
+  ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
+  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
+  A->ops->mult    = MatMult_SeqAIJViennaCL;
+  A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
+  PetscFunctionReturn(0);
+}
+
+/* --------------------------------------------------------------------------------*/
+#undef __FUNCT__
+#define __FUNCT__ "MatCreateSeqAIJViennaCL"
+/*@
+   MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
+   (the default parallel PETSc format).  This matrix will ultimately be pushed down
+   to the GPU and will use the ViennaCL library for calculations. For good matrix
+   assembly performance the user should preallocate the matrix storage by setting
+   the parameter nz (or the array nnz).  By setting these parameters accurately,
+   performance during matrix assembly can be increased substantially.
+
+
+   Collective on MPI_Comm
+
+   Input Parameters:
++  comm - MPI communicator, set to PETSC_COMM_SELF
+.  m - number of rows
+.  n - number of columns
+.  nz - number of nonzeros per row (same for all rows)
+-  nnz - array containing the number of nonzeros in the various rows
+         (possibly different for each row) or NULL
+
+   Output Parameter:
+.  A - the matrix
+
+   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
+   MatXXXXSetPreallocation() paradigm instead of this routine directly.
+   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
+
+   Notes:
+   If nnz is given then nz is ignored
+
+   The AIJ format (also called the Yale sparse matrix format or
+   compressed row storage), is fully compatible with standard Fortran 77
+   storage.  That is, the stored row and column indices can begin at
+   either one (as in Fortran) or zero.  See the users' manual for details.
+
+   Specify the preallocated storage with either nz or nnz (not both).
+   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
+   allocation.  For large problems you MUST preallocate memory or you
+   will get TERRIBLE performance, see the users' manual chapter on matrices.
+
+   By default, this format uses inodes (identical nodes) when possible, to
+   improve numerical efficiency of matrix-vector products and solves. We
+   search for consecutive rows with the same nonzero structure, thereby
+   reusing matrix information to achieve increased efficiency.
+
+   Level: intermediate
+
+.seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
+
+@*/
+PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
+{
+  PetscErrorCode ierr;
+
+  PetscFunctionBegin;
+  ierr = MatCreate(comm,A);CHKERRQ(ierr);
+  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
+  ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
+  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
+  PetscFunctionReturn(0);
+}
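
As recommended in the manual page above, the same matrix can also be obtained through the generic creation paradigm; a sketch (not part of the diff), with m, n and nz as in the argument list:

    /* Equivalent, in effect, to MatCreateSeqAIJViennaCL(PETSC_COMM_SELF,m,n,nz,NULL,&A) */
    Mat            A;
    PetscErrorCode ierr;

    ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
    ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(A,nz,NULL);CHKERRQ(ierr);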
+
+
+#undef __FUNCT__
+#define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
+PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
+{
+  PetscErrorCode ierr;
+  Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
+
+  PetscFunctionBegin;
+  try {
+    if (A->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) delete (ViennaCLAIJMatrix*)(viennaclcontainer->mat);
+    delete viennaclcontainer;
+    A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
+  } catch(char *ex) {
+    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
+  }
+  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
+  A->spptr = 0;
+  ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
+  PetscFunctionReturn(0);
+}
+
+
+extern PetscErrorCode MatSetValuesBatch_SeqAIJViennaCL(Mat, PetscInt, PetscInt, PetscInt*,const PetscScalar*);
+
+
+#undef __FUNCT__
+#define __FUNCT__ "MatCreate_SeqAIJViennaCL"
+PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
+{
+  PetscErrorCode ierr;
+  Mat_SeqAIJ     *aij;
+
+  PetscFunctionBegin;
+  ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
+  aij             = (Mat_SeqAIJ*)B->data;
+  aij->inode.use  = PETSC_FALSE;
+  B->ops->mult    = MatMult_SeqAIJViennaCL;
+  B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
+  B->spptr        = new Mat_SeqAIJViennaCL();
+
+  ((Mat_SeqAIJViennaCL*)B->spptr)->mat     = 0;
+
+  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
+  B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
+  B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
+  B->ops->setvaluesbatch = MatSetValuesBatch_SeqAIJViennaCL;
+
+  //ierr = PetscObjectComposeFunction((PetscObject)B,"MatViennaCLSetFormat_C", "MatViennaCLSetFormat_SeqAIJViennaCL", MatViennaCLSetFormat_SeqAIJViennaCL);CHKERRQ(ierr);
+  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
+
+  B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
+  PetscFunctionReturn(0);
+}
+
+
+/*M
+   MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
+
+   A matrix type whose data resides on GPUs. These matrices are in CSR format by
+   default. All matrix calculations are performed using the ViennaCL library.
+
+   Options Database Keys:
++  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
+.  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
+-  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
+
+  Level: beginner
+
+.seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
+M*/
+

src/mat/impls/aij/seq/seqviennacl/makefile

+#requirespackage 'PETSC_HAVE_VIENNACL'
+#requireslanguage CXXONLY
+
+ALL: lib
+
+CFLAGS    =
+FFLAGS    =
+SOURCEC   = aijviennacl.cpp aijAssemble.cpp
+SOURCEF   =
+SOURCEH   = viennaclmatimpl.h
+LIBBASE   = libpetscmat
+DIRS      =
+MANSEC    = Mat
+LOCDIR    = src/mat/impls/aij/seq/seqviennacl/
+
+include ${PETSC_DIR}/conf/variables
+include ${PETSC_DIR}/conf/rules
+include ${PETSC_DIR}/conf/test

src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h

+#if !defined(__VIENNACLMATIMPL)
+#define __VIENNACLMATIMPL
+
+#include <../src/vec/vec/impls/seq/seqviennacl/viennaclvecimpl.h>
+
+/* for everything else */
+#include "viennacl/compressed_matrix.hpp"
+
+
+/* Old way */
+typedef viennacl::compressed_matrix<PetscScalar>   ViennaCLAIJMatrix;
+
+
+struct Mat_SeqAIJViennaCL {
+  ViennaCLAIJMatrix      *mat;  /* pointer to the matrix on the GPU */
+};
+
+
+PETSC_INTERN PetscErrorCode MatViennaCLCopyToGPU(Mat);
+PETSC_INTERN PetscErrorCode MatViennaCLCopyFromGPU(Mat, ViennaCLAIJMatrix*);
+#endif

src/mat/interface/matregis.c

 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat);
 #endif
 
+#if defined PETSC_HAVE_VIENNACL
+PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat);
+//PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJViennaCL(Mat);
+#endif
+
 #if defined PETSC_HAVE_FFTW
 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat);
 #endif

src/mat/interface/matrix.c

 PetscLogEvent MAT_Applypapt, MAT_Applypapt_numeric, MAT_Applypapt_symbolic, MAT_GetSequentialNonzeroStructure;
 PetscLogEvent MAT_GetMultiProcBlock;
 PetscLogEvent MAT_CUSPCopyToGPU, MAT_CUSPARSECopyToGPU, MAT_SetValuesBatch, MAT_SetValuesBatchI, MAT_SetValuesBatchII, MAT_SetValuesBatchIII, MAT_SetValuesBatchIV;
+PetscLogEvent MAT_ViennaCLCopyToGPU;
 PetscLogEvent MAT_Merge;
 
 const char *const MatFactorTypes[] = {"NONE","LU","CHOLESKY","ILU","ICC","ILUDT","MatFactorType","MAT_FACTOR_",0};
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 #else
   return 0;
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   if (inassm == 1 && type != MAT_FLUSH_ASSEMBLY) {
     if (mat->viewonassembly) {
       ierr = PetscViewerPushFormat(mat->viewonassembly,mat->viewformatonassembly);CHKERRQ(ierr);
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     mat->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (mat->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    mat->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 

src/mat/utils/axpy.c

     Y->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (Y->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    Y->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }
 
     Y->valid_GPU_matrix = PETSC_CUSP_CPU;
   }
 #endif
+#if defined(PETSC_HAVE_VIENNACL)
+  if (Y->valid_GPU_matrix != PETSC_VIENNACL_UNALLOCATED) {
+    Y->valid_GPU_matrix = PETSC_VIENNACL_CPU;
+  }
+#endif
   PetscFunctionReturn(0);
 }