Commits

Karl Rupp  committed 603759c Merge

Merge branch 'paulmullowney/cusp-vector-scatter-with-fix'

* paulmullowney/cusp-vector-scatter-with-fix:
CUDA/CUSP: Implementation of Sequential to Sequential Vector Scatters

  • Participants
  • Parent commits 76ef003, 8ab02f2

Comments (0)

Files changed (13)

File include/petscvec.h

 
 #if defined(PETSC_HAVE_CUSP)
 typedef struct _p_PetscCUSPIndices* PetscCUSPIndices;
-PETSC_EXTERN PetscErrorCode PetscCUSPIndicesCreate(PetscInt, PetscInt*,PetscInt, PetscInt*,PetscCUSPIndices*);
-PETSC_EXTERN PetscErrorCode PetscCUSPIndicesDestroy(PetscCUSPIndices*);
+typedef struct _p_VecScatterCUSPIndices_StoS* VecScatterCUSPIndices_StoS;
+typedef struct _p_VecScatterCUSPIndices_PtoP* VecScatterCUSPIndices_PtoP;
 PETSC_EXTERN PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec,PetscCUSPIndices);
 PETSC_EXTERN PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec,PetscCUSPIndices);
 PETSC_EXTERN PetscErrorCode VecScatterInitializeForGPU(VecScatter,Vec,ScatterMode);

File src/mat/impls/aij/seq/seqcusp/aijcusp.cu

       cuspstruct->nonzerorow=0;
       for (int j = 0; j<m; j++) cuspstruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
 
-      mat = new CUSPMATRIX;
       if (a->compressedrow.use) {
         m    = a->compressedrow.nrows;
         ii   = a->compressedrow.i;
         ridx = a->compressedrow.rindex;
-        mat->resize(m,A->cmap->n,a->nz);
-        mat->row_offsets.assign(ii,ii+m+1);
-        mat->column_indices.assign(a->j,a->j+a->nz);
-        mat->values.assign(a->a,a->a+a->nz);
-        cuspstruct->indices = new CUSPINTARRAYGPU;
-        cuspstruct->indices->assign(ridx,ridx+m);
+      } else {
+        /* Forcing compressed row on the GPU */
+        int k=0;
+        ierr = PetscMalloc((cuspstruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
+        ierr = PetscMalloc((cuspstruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
+        ii[0]=0;
+        for (int j = 0; j<m; j++) {
+          if ((a->i[j+1]-a->i[j])>0) {
+            ii[k]  = a->i[j];
+            ridx[k]= j;
+            k++;
+          }
+        }
+        ii[cuspstruct->nonzerorow] = a->nz;
+        m = cuspstruct->nonzerorow;
+      }
+
+      /* now build matrix */
+      mat = new CUSPMATRIX;
+      mat->resize(m,A->cmap->n,a->nz);
+      mat->row_offsets.assign(ii,ii+m+1);
+      mat->column_indices.assign(a->j,a->j+a->nz);
+      mat->values.assign(a->a,a->a+a->nz);
+
+      /* convert to other formats if selected */
+      if (a->compressedrow.use || cuspstruct->format==MAT_CUSP_CSR) {
 	cuspstruct->mat = mat;
 	cuspstruct->format = MAT_CUSP_CSR;
       } else {
-        mat->resize(m,A->cmap->n,a->nz);
-        mat->row_offsets.assign(a->i,a->i+m+1);
-        mat->column_indices.assign(a->j,a->j+a->nz);
-        mat->values.assign(a->a,a->a+a->nz);
-
-	/* convert to other formats if selected */
-	if (cuspstruct->format!=MAT_CUSP_CSR) {
-	  if (cuspstruct->format==MAT_CUSP_ELL) {
-	    CUSPMATRIXELL *ellMat = new CUSPMATRIXELL(*mat);
-	    cuspstruct->mat = ellMat;
-	  } else {
-	    CUSPMATRIXDIA *diaMat = new CUSPMATRIXDIA(*mat);
-	    cuspstruct->mat = diaMat;
-	  }
-	  delete (CUSPMATRIX *) mat;
-	} else 
-	  cuspstruct->mat = mat;
+	if (cuspstruct->format==MAT_CUSP_ELL) {
+	  CUSPMATRIXELL *ellMat = new CUSPMATRIXELL(*mat);
+	  cuspstruct->mat = ellMat;
+	} else {
+	  CUSPMATRIXDIA *diaMat = new CUSPMATRIXDIA(*mat);
+	  cuspstruct->mat = diaMat;
+	}
+	delete (CUSPMATRIX*) mat;
+      }
+
+      /* assign the compressed row indices */
+      cuspstruct->indices = new CUSPINTARRAYGPU;
+      cuspstruct->indices->assign(ridx,ridx+m);
+
+      /* free the temporaries */
+      if (!a->compressedrow.use) {
+        ierr = PetscFree(ii);CHKERRQ(ierr);
+        ierr = PetscFree(ridx);CHKERRQ(ierr);
       }
       cuspstruct->tempvec = new CUSPARRAY;
       cuspstruct->tempvec->resize(m);
 {
   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
   PetscErrorCode ierr;
+  PetscBool      usecprow = a->compressedrow.use;
   Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP*)A->spptr;
   CUSPARRAY      *xarray     = NULL,*yarray=NULL,*zarray=NULL;
 
     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
 
-    if (a->compressedrow.use) {
+    if (usecprow) { 
+      /* use compressed row format */
       CUSPMATRIX *mat = (CUSPMATRIX*)cuspstruct->mat;
-      cusp::multiply(*mat,*xarray, *cuspstruct->tempvec);
-      thrust::for_each(
-        thrust::make_zip_iterator(
-          thrust::make_tuple(cuspstruct->tempvec->begin(),
-                             thrust::make_permutation_iterator(zarray->begin(), cuspstruct->indices->begin()))),
-        thrust::make_zip_iterator(
-          thrust::make_tuple(cuspstruct->tempvec->begin(),
-                             thrust::make_permutation_iterator(zarray->begin(),cuspstruct->indices->begin()))) + cuspstruct->tempvec->size(),
-        VecCUSPPlusEquals());
-    } else {
+      cusp::multiply(*mat,*xarray,*cuspstruct->tempvec);
+      ierr = VecSet_SeqCUSP(yy,0.0);CHKERRQ(ierr);
+      thrust::copy(cuspstruct->tempvec->begin(),cuspstruct->tempvec->end(),thrust::make_permutation_iterator(yarray->begin(),cuspstruct->indices->begin()));
+    } else { 
+
       if (cuspstruct->format==MAT_CUSP_ELL) {
 	CUSPMATRIXELL *mat = (CUSPMATRIXELL*)cuspstruct->mat;
 	cusp::multiply(*mat,*xarray,*cuspstruct->tempvec);
 	CUSPMATRIX *mat = (CUSPMATRIX*)cuspstruct->mat;
 	cusp::multiply(*mat,*xarray,*cuspstruct->tempvec);
       }
-      thrust::for_each(
-        thrust::make_zip_iterator(thrust::make_tuple(
-                                    cuspstruct->tempvec->begin(),
-                                    zarray->begin())),
-        thrust::make_zip_iterator(thrust::make_tuple(
-                                    cuspstruct->tempvec->end(),
-                                    zarray->end())),
-        VecCUSPPlusEquals());
+
+      if (zarray->size() == cuspstruct->indices->size()) {
+	thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cuspstruct->tempvec->begin(),zarray->begin())),
+			 thrust::make_zip_iterator(thrust::make_tuple(cuspstruct->tempvec->end(),zarray->end())),
+			 VecCUSPPlusEquals());
+      } else {
+	thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cuspstruct->tempvec->begin(),
+                             thrust::make_permutation_iterator(zarray->begin(), cuspstruct->indices->begin()))),
+			 thrust::make_zip_iterator(thrust::make_tuple(cuspstruct->tempvec->end(),
+                             thrust::make_permutation_iterator(zarray->begin(), cuspstruct->indices->end()))),
+			 VecCUSPPlusEquals());
+      }
     }
     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);

File src/vec/vec/examples/tests/ex44.c

+#include "petscvec.h"
+
+static char help[] = "Tests VecScatter Sequential to Sequential for (CUSP) vectors\n\
+  -m # : the size of the vectors\n					\
+  -n # : the number of indices (with n<=m)\n				\
+  -toFirst # : the starting index of the output vector for strided scatters\n \
+  -toStep # : the step size into the output vector for strided scatters\n \
+  -fromFirst # : the starting index of the input vector for strided scatters\n\
+  -fromStep # : the step size into the input vector for strided scatters\n\n";
+
+int main(int argc, char * argv[]) {
+
+  Vec            X,Y;
+  PetscInt       m,n,i,n1,n2;
+  PetscInt       toFirst,toStep,fromFirst,fromStep;
+  PetscInt       *idx,*idy;
+  PetscBool      flg;
+  IS             toISStrided,fromISStrided,toISGeneral,fromISGeneral;
+  VecScatter     vscatSStoSS,vscatSStoSG,vscatSGtoSS,vscatSGtoSG;
+  ScatterMode    mode;
+  InsertMode     addv;
+  PetscErrorCode ierr;
+
+  ierr = PetscInitialize(&argc,&argv,0,help);CHKERRQ(ierr);
+  ierr = PetscOptionsGetInt(NULL,"-m",&m,&flg);CHKERRQ(ierr);
+  if (!flg) m = 100;
+
+  ierr = PetscOptionsGetInt(NULL,"-n",&n,&flg);CHKERRQ(ierr);
+  if (!flg) n = 30;
+
+  ierr = PetscOptionsGetInt(NULL,"-toFirst",&toFirst,&flg);CHKERRQ(ierr);
+  if (!flg) toFirst = 3;
+
+  ierr = PetscOptionsGetInt(NULL,"-toStep",&toStep,&flg);CHKERRQ(ierr);
+  if (!flg) toStep = 3;
+
+  ierr = PetscOptionsGetInt(NULL,"-fromFirst",&fromFirst,&flg);CHKERRQ(ierr);
+  if (!flg) fromFirst = 2;
+
+  ierr = PetscOptionsGetInt(NULL,"-fromStep",&fromStep,&flg);CHKERRQ(ierr);
+  if (!flg) fromStep = 2;
+
+  if (n>m) {
+    PetscPrintf(PETSC_COMM_WORLD,"The vector sizes are %d. The number of elements being scattered is %d\n",m,n);CHKERRQ(ierr);
+    PetscPrintf(PETSC_COMM_WORLD,"Adjust the parameters such that m>=n\n");CHKERRQ(ierr);
+  } else if (toFirst+(n-1)*toStep >=m) {
+    PetscPrintf(PETSC_COMM_WORLD,"The vector sizes are %d. The number of elements being scattered is %d\n",m,n);CHKERRQ(ierr);
+    PetscPrintf(PETSC_COMM_WORLD,"For the Strided Scatter, toFirst=%d and toStep=%d.\n",toFirst,toStep);CHKERRQ(ierr);
+    PetscPrintf(PETSC_COMM_WORLD,"This produces an index (toFirst+(n-1)*toStep)>=m\n");CHKERRQ(ierr);
+    PetscPrintf(PETSC_COMM_WORLD,"Adjust the parameters accordingly with -m, -n, -toFirst, or -toStep\n");CHKERRQ(ierr);
+  } else if (fromFirst+(n-1)*fromStep>=m) {
+    PetscPrintf(PETSC_COMM_WORLD,"The vector sizes are %d. The number of elements being scattered is %d\n",m,n);CHKERRQ(ierr);
+    PetscPrintf(PETSC_COMM_WORLD,"For the Strided Scatter, fromFirst=%d and fromStep=%d.\n",fromFirst,fromStep);CHKERRQ(ierr);
+    PetscPrintf(PETSC_COMM_WORLD,"This produces an index (fromFirst+(n-1)*fromStep)>=m\n");CHKERRQ(ierr);
+    PetscPrintf(PETSC_COMM_WORLD,"Adjust the parameters accordingly with -m, -n, -fromFirst, or -fromStep\n");CHKERRQ(ierr);
+  } else {
+    PetscPrintf(PETSC_COMM_WORLD,"m=%d\tn=%d\tfromFirst=%d\tfromStep=%d\ttoFirst=%d\ttoStep=%d\n",m,n,fromFirst,fromStep,toFirst,toStep);CHKERRQ(ierr);
+    PetscPrintf(PETSC_COMM_WORLD,"fromFirst+(n-1)*fromStep=%d\ttoFirst+(n-1)*toStep=%d\n",fromFirst+(n-1)*fromStep,toFirst+(n-1)*toStep);CHKERRQ(ierr);
+
+    /* Build the vectors */
+    ierr = VecCreate(PETSC_COMM_WORLD,&Y);CHKERRQ(ierr);
+    ierr = VecSetSizes(Y,m,PETSC_DECIDE);CHKERRQ(ierr);
+    ierr = VecCreate(PETSC_COMM_WORLD,&X);CHKERRQ(ierr);
+    ierr = VecSetSizes(X,m,PETSC_DECIDE);CHKERRQ(ierr);
+
+    ierr = VecSetFromOptions(Y);CHKERRQ(ierr);
+    ierr = VecSetFromOptions(X);CHKERRQ(ierr);
+    ierr = VecSet(X,2.0);CHKERRQ(ierr);
+    ierr = VecSet(Y,1.0);CHKERRQ(ierr);
+    
+    /* Build the strided index sets */
+    ierr = ISCreate(PETSC_COMM_WORLD,&toISStrided);CHKERRQ(ierr);
+    ierr = ISCreate(PETSC_COMM_WORLD,&fromISStrided);CHKERRQ(ierr);
+    ierr = ISSetType(toISStrided, ISSTRIDE);CHKERRQ(ierr);
+    ierr = ISSetType(fromISStrided, ISSTRIDE);CHKERRQ(ierr);
+    ierr = ISStrideSetStride(fromISStrided,n,fromFirst,fromStep);CHKERRQ(ierr);
+    ierr = ISStrideSetStride(toISStrided,n,toFirst,toStep);CHKERRQ(ierr);
+    
+    /* Build the general index sets */
+    ierr = PetscMalloc(n*sizeof(PetscInt),&idx);CHKERRQ(ierr);
+    ierr = PetscMalloc(n*sizeof(PetscInt),&idy);CHKERRQ(ierr);
+    for (i=0; i<n; i++) {
+      idx[i] = i % m;
+      idy[i] = (i+m) % m;
+    }
+    n1 = n;
+    n2 = n;
+    ierr = PetscSortRemoveDupsInt(&n1,idx);CHKERRQ(ierr);
+    ierr = PetscSortRemoveDupsInt(&n2,idy);CHKERRQ(ierr);
+
+    ierr = ISCreateGeneral(PETSC_COMM_WORLD,n1,idx,PETSC_COPY_VALUES,&toISGeneral);CHKERRQ(ierr);
+    ierr = ISCreateGeneral(PETSC_COMM_WORLD,n2,idy,PETSC_COPY_VALUES,&fromISGeneral);CHKERRQ(ierr);
+    
+    /* set the mode and the insert/add parameter */
+    mode = SCATTER_FORWARD;
+    addv = ADD_VALUES;
+    
+    /* VecScatter : Seq Strided to Seq Strided */
+    ierr = VecScatterCreate(X,fromISStrided,Y,toISStrided,&vscatSStoSS);CHKERRQ(ierr);
+    ierr = VecScatterBegin(vscatSStoSS,X,Y,addv,mode);CHKERRQ(ierr);
+    ierr = VecScatterEnd(vscatSStoSS,X,Y,addv,mode);CHKERRQ(ierr);
+    ierr = VecScatterDestroy(&vscatSStoSS);CHKERRQ(ierr);
+    
+    /* VecScatter : Seq General to Seq Strided */
+    ierr = VecScatterCreate(Y,fromISGeneral,X,toISStrided,&vscatSGtoSS);CHKERRQ(ierr);
+    ierr = VecScatterBegin(vscatSGtoSS,Y,X,addv,mode);CHKERRQ(ierr);
+    ierr = VecScatterEnd(vscatSGtoSS,Y,X,addv,mode);CHKERRQ(ierr);
+    ierr = VecScatterDestroy(&vscatSGtoSS);CHKERRQ(ierr);
+    
+    /* VecScatter : Seq General to Seq General */
+    ierr = VecScatterCreate(X,fromISGeneral,Y,toISGeneral,&vscatSGtoSG);CHKERRQ(ierr);
+    ierr = VecScatterBegin(vscatSGtoSG,X,Y,addv,mode);CHKERRQ(ierr);
+    ierr = VecScatterEnd(vscatSGtoSG,X,Y,addv,mode);CHKERRQ(ierr);
+    ierr = VecScatterDestroy(&vscatSGtoSG);CHKERRQ(ierr);
+    
+    /* VecScatter : Seq Strided to Seq General */
+    ierr = VecScatterCreate(Y,fromISStrided,X,toISGeneral,&vscatSStoSG);CHKERRQ(ierr);
+    ierr = VecScatterBegin(vscatSStoSG,Y,X,addv,mode);CHKERRQ(ierr);
+    ierr = VecScatterEnd(vscatSStoSG,Y,X,addv,mode);CHKERRQ(ierr);
+    ierr = VecScatterDestroy(&vscatSStoSG);CHKERRQ(ierr);
+    
+    /* view the results */
+    ierr = VecView(Y,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
+    
+    /* Cleanup */
+    ierr = VecDestroy(&X);CHKERRQ(ierr);
+    ierr = VecDestroy(&Y);CHKERRQ(ierr);
+    ierr = ISDestroy(&toISStrided);CHKERRQ(ierr);
+    ierr = ISDestroy(&fromISStrided);CHKERRQ(ierr);
+    ierr = ISDestroy(&toISGeneral);CHKERRQ(ierr);
+    ierr = ISDestroy(&fromISGeneral);CHKERRQ(ierr);
+    ierr = PetscFree(idx);CHKERRQ(ierr);
+    ierr = PetscFree(idy);CHKERRQ(ierr);
+  }
+  PetscFinalize();
+
+  return 0;
+}

File src/vec/vec/examples/tests/makefile

 	-${CLINKER} -o ex43 ex43.o ${PETSC_VEC_LIB}
 	${RM} -f ex43.o
 
+ex44: ex44.o  chkopts
+	-${CLINKER} -o ex44 ex44.o ${PETSC_VEC_LIB}
+	${RM} -f ex44.o
+
 #--------------------------------------------------------------------------
 runex1:
 	-@${MPIEXEC} -n 1 ./ex1 > ex1_1.tmp 2>&1;\
 	   ${DIFF} output/ex43_1.out ex43_1.tmp || echo  ${PWD} "\nPossible problem with ex43, diffs above \n========================================="; \
 	   ${RM} -f ex43_1.tmp
 
+runex44:
+	-@${MPIEXEC} -n 1 ./ex44 -vec_type cusp > ex44.tmp 2>&1;\
+	   ${DIFF} output/ex44.out ex44.tmp || echo  ${PWD} "\nPossible problem with ex44, diffs above \n========================================="; \
+	   ${RM} -f ex44.tmp
+
 TESTEXAMPLES_C		    = ex1.PETSc runex1 ex1.rm ex2.PETSc runex2 ex2.rm ex3.PETSc runex3 ex3.rm \
                               ex4.PETSc runex4 ex4.rm ex5.PETSc ex5.rm ex6.PETSc runex6 ex6.rm ex7.PETSc \
                               runex7 ex7.rm ex8.PETSc runex8 ex8.rm ex9.PETSc runex9 ex9.rm ex11.PETSc runex11 \
 TESTEXAMPLES_FORTRAN_MPIUNI = ex19f.PETSc runex19f ex19f.rm
 TESTEXAMPLES_C_X_MPIUNI   = ex2.PETSc runex2 ex2.rm ex4.PETSc ex4.rm ex6.PETSc runex6 ex6.rm ex7.PETSc runex7 \
                               ex7.rm ex8.PETSc runex8 ex8.rm ex14.PETSc ex14.rm
+TESTEXAMPLES_CUSP           = ex44.PETSc runex44 ex44.rm
 
 include ${PETSC_DIR}/conf/test

File src/vec/vec/examples/tests/output/ex44.out

+m=100	n=30	fromFirst=2	fromStep=2	toFirst=3	toStep=3
+fromFirst+(n-1)*fromStep=60	toFirst+(n-1)*toStep=90
+Vec Object: 1 MPI processes
+  type: seqcusp
+3
+3
+3
+6
+3
+3
+6
+3
+3
+6
+3
+3
+8
+3
+3
+6
+3
+3
+6
+3
+3
+8
+3
+3
+6
+3
+3
+6
+3
+3
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+3
+1
+1
+1
+1
+1
+1
+1
+1
+1

File src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h

 PETSC_INTERN PetscErrorCode VecCUSPCopyToGPU_Public(Vec);
 PETSC_INTERN PetscErrorCode VecCUSPAllocateCheck_Public(Vec);
 
-struct  _p_PetscCUSPIndices {
-  PetscInt ns;
-  PetscInt sendLowestIndex;
-  PetscInt nr;
-  PetscInt recvLowestIndex;
-};
-
 #define CHKERRCUSP(err) if (((int)err) != (int)CUBLAS_STATUS_SUCCESS) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error %d",err)
 
 #define VecCUSPCastToRawPtr(x) thrust::raw_pointer_cast(&(x)[0])
   PetscBool hostDataRegisteredAsPageLocked;
 };
 
+PETSC_INTERN PetscErrorCode VecScatterCUSPIndicesCreate_PtoP(PetscInt, PetscInt*,PetscInt, PetscInt*,PetscCUSPIndices*);
+PETSC_INTERN PetscErrorCode VecScatterCUSPIndicesCreate_StoS(PetscInt,PetscInt,PetscInt,PetscInt,PetscInt,PetscInt*,PetscInt*,PetscCUSPIndices*);
+PETSC_INTERN PetscErrorCode VecScatterCUSPIndicesDestroy(PetscCUSPIndices*);
+PETSC_INTERN PetscErrorCode VecScatterCUSP_StoS(Vec,Vec,PetscCUSPIndices,InsertMode,ScatterMode);
+
+typedef enum {VEC_SCATTER_CUSP_STOS, VEC_SCATTER_CUSP_PTOP} VecCUSPScatterType;
+typedef enum {VEC_SCATTER_CUSP_GENERAL, VEC_SCATTER_CUSP_STRIDED} VecCUSPSequentialScatterMode;
+
+struct  _p_VecScatterCUSPIndices_PtoP {
+  PetscInt ns;
+  PetscInt sendLowestIndex;
+  PetscInt nr;
+  PetscInt recvLowestIndex;
+};
+
+struct  _p_VecScatterCUSPIndices_StoS {
+  /* from indices data */
+  PetscInt *fslots;
+  PetscInt fromFirst;
+  PetscInt fromStep;
+  VecCUSPSequentialScatterMode fromMode;
+
+  /* to indices data */
+  PetscInt *tslots;
+  PetscInt toFirst;
+  PetscInt toStep;
+  VecCUSPSequentialScatterMode toMode;
+
+  PetscInt n;
+  PetscInt MAX_BLOCKS;
+  PetscInt MAX_CORESIDENT_THREADS;
+  cudaStream_t stream;
+};
+
+struct  _p_PetscCUSPIndices {
+  void * scatter;
+  VecCUSPScatterType scatterType;
+};
+
 #endif

File src/vec/vec/impls/seq/seqcusp/makefile

 
 CFLAGS   =
 FFLAGS   =
-SOURCECU = veccusp.cu
+SOURCECU = veccusp.cu vecscattercusp.cu
 SOURCEF  =
 SOURCEH  = cuspvecimpl.h
 LIBBASE  = libpetscvec

File src/vec/vec/impls/seq/seqcusp/veccusp.cu

   PetscScalar    *cpuPtr, *gpuPtr;
   cudaStream_t   stream;
   Vec_Seq        *s;
+  VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
 
   PetscFunctionBegin;
   ierr = VecCUSPAllocateCheck(v);CHKERRQ(ierr);
 
     ierr   = PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);CHKERRQ(ierr);
     varray = ((Vec_CUSP*)v->spptr)->GPUarray;
-    gpuPtr = varray->data().get() + ci->recvLowestIndex;
-    cpuPtr = s->array + ci->recvLowestIndex;
+    gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
+    cpuPtr = s->array + ptop_scatter->recvLowestIndex;
 
     /* Note : this code copies the smallest contiguous chunk of data
        containing ALL of the indices */
-    err = cudaMemcpyAsync(gpuPtr, cpuPtr, ci->nr*sizeof(PetscScalar),
+    err = cudaMemcpyAsync(gpuPtr, cpuPtr, ptop_scatter->nr*sizeof(PetscScalar),
                            cudaMemcpyHostToDevice, stream);CHKERRCUSP(err);
     err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
 
   PetscScalar    *cpuPtr, *gpuPtr;
   cudaStream_t   stream;
   Vec_Seq        *s;
+  VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
 
   PetscFunctionBegin;
   ierr = VecCUSPAllocateCheckHost(v);CHKERRQ(ierr);
     stream=((Vec_CUSP*)v->spptr)->stream;
     varray=((Vec_CUSP*)v->spptr)->GPUarray;
     s = (Vec_Seq*)v->data;
-    gpuPtr = varray->data().get() + ci->sendLowestIndex;
-    cpuPtr = s->array + ci->sendLowestIndex;
+    gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
+    cpuPtr = s->array + ptop_scatter->sendLowestIndex;
 
     /* Note : this code copies the smallest contiguous chunk of data
        containing ALL of the indices */
-    err = cudaMemcpyAsync(cpuPtr, gpuPtr, ci->ns*sizeof(PetscScalar),
+    err = cudaMemcpyAsync(cpuPtr, gpuPtr, ptop_scatter->ns*sizeof(PetscScalar),
 			   cudaMemcpyDeviceToHost, stream);CHKERRCUSP(err);
     err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
 
   PetscFunctionReturn(0);
 }
 
-
 #undef __FUNCT__
 #define __FUNCT__ "VecCopy_SeqCUSP_Private"
 static PetscErrorCode VecCopy_SeqCUSP_Private(Vec xin,Vec yin)
 }
 
 
-#undef __FUNCT__
-#define __FUNCT__ "PetscCUSPIndicesCreate"
-/*
-    PetscCUSPIndicesCreate - creates the data structure needed by VecCUSPCopyToGPUSome_Public()
-
-   Input Parameters:
-+    n - the number of indices
--    indices - integer list of indices
-
-   Output Parameter:
-.    ci - the CUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()
-
-.seealso: PetscCUSPIndicesDestroy(), VecCUSPCopyToGPUSome_Public()
-*/
-PetscErrorCode PetscCUSPIndicesCreate(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
-{
-  PetscCUSPIndices cci;
-  PetscFunctionBegin;
-  cci = new struct _p_PetscCUSPIndices;
-  /* this calculation assumes that the input indices are sorted */
-  cci->ns = sendIndices[ns-1]-sendIndices[0]+1;
-  cci->sendLowestIndex = sendIndices[0];
-  cci->nr = recvIndices[nr-1]-recvIndices[0]+1;
-  cci->recvLowestIndex = recvIndices[0];
-  *ci = cci;
-  PetscFunctionReturn(0);
-}
-
-#undef __FUNCT__
-#define __FUNCT__ "PetscCUSPIndicesDestroy"
-/*
-    PetscCUSPIndicesDestroy - destroys the data structure needed by VecCUSPCopyToGPUSome_Public()
-
-   Input Parameters:
-.    ci - the CUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()
-
-.seealso: PetscCUSPIndicesCreate(), VecCUSPCopyToGPUSome_Public()
-*/
-PetscErrorCode PetscCUSPIndicesDestroy(PetscCUSPIndices *ci)
-{
-  PetscFunctionBegin;
-  if (!(*ci)) PetscFunctionReturn(0);
-  try {
-    if (ci) delete *ci;
-  } catch(char *ex) {
-    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
-  }
-  *ci = 0;
-  PetscFunctionReturn(0);
-}
-
 
 #undef __FUNCT__
 #define __FUNCT__ "VecCUSPCopyToGPUSome_Public"

File src/vec/vec/impls/seq/seqcusp/vecscattercusp.cu

+/*
+   Implements the various scatter operations on cusp vectors
+*/
+
+#include <petscconf.h>
+PETSC_CUDA_EXTERN_C_BEGIN
+#include <petsc-private/vecimpl.h>          /*I "petscvec.h" I*/
+#include <../src/vec/vec/impls/dvecimpl.h>
+PETSC_CUDA_EXTERN_C_END
+#include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>
+
+#include <cuda_runtime.h>
+
+#undef __FUNCT__
+#define __FUNCT__ "VecScatterCUSPIndicesCreate_StoS"
+PetscErrorCode VecScatterCUSPIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep, PetscInt fromStep,PetscInt *tslots, PetscInt *fslots,PetscCUSPIndices *ci) {
+
+  PetscCUSPIndices           cci;
+  VecScatterCUSPIndices_StoS stos_scatter;
+  cudaError_t                err = cudaSuccess;
+  cudaStream_t               stream;
+  PetscInt                   *intVecGPU;
+  int                        device;
+  cudaDeviceProp             props;
+
+  PetscFunctionBegin;
+  cci = new struct _p_PetscCUSPIndices;
+  stos_scatter = new struct _p_VecScatterCUSPIndices_StoS;
+
+  /* create the "from" indices */
+  stos_scatter->fslots = 0;
+  stos_scatter->fromFirst = 0;
+  stos_scatter->fromStep = 0;
+  if (n) {
+    if (fslots) {
+      /* allocate GPU memory for the from-slots */
+      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
+      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);
+
+      /* assign the pointer to the struct */
+      stos_scatter->fslots = intVecGPU;
+      stos_scatter->fromMode = VEC_SCATTER_CUSP_GENERAL;
+    } else if (fromStep) {
+      stos_scatter->fromFirst = fromFirst;
+      stos_scatter->fromStep = fromStep;
+      stos_scatter->fromMode = VEC_SCATTER_CUSP_STRIDED;
+    }
+  }
+
+  /* create the "to" indices */
+  stos_scatter->tslots = 0;
+  stos_scatter->toFirst = 0;
+  stos_scatter->toStep = 0;
+  if (n) {
+    if (tslots) {
+      /* allocate GPU memory for the to-slots */
+      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
+      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);
+
+      /* assign the pointer to the struct */
+      stos_scatter->tslots = intVecGPU;
+      stos_scatter->toMode = VEC_SCATTER_CUSP_GENERAL;
+    } else if (toStep) {
+      stos_scatter->toFirst = toFirst;
+      stos_scatter->toStep = toStep;
+      stos_scatter->toMode = VEC_SCATTER_CUSP_STRIDED;
+    }
+  }
+
+  /* allocate the stream variable */
+  err = cudaStreamCreate(&stream);CHKERRCUSP((int)err);
+  stos_scatter->stream = stream;
+
+  /* the number of indices */
+  stos_scatter->n = n;
+
+  /* get the maximum number of coresident thread blocks */
+  cudaGetDevice(&device);
+  cudaGetDeviceProperties(&props, device);
+  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
+  if (props.major>=3) {
+    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
+  } else {
+    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
+  }
+
+  /* assign the indices */
+  cci->scatter = (VecScatterCUSPIndices_StoS)stos_scatter;
+  cci->scatterType = VEC_SCATTER_CUSP_STOS;
+  *ci = cci;
+  PetscFunctionReturn(0);
+}
+
+#undef __FUNCT__
+#define __FUNCT__ "VecScatterCUSPIndicesCreate_PtoP"
+PetscErrorCode VecScatterCUSPIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
+{
+  PetscCUSPIndices           cci;
+  VecScatterCUSPIndices_PtoP ptop_scatter;
+
+  PetscFunctionBegin;
+  cci = new struct _p_PetscCUSPIndices;
+  ptop_scatter = new struct _p_VecScatterCUSPIndices_PtoP;
+
+  /* this calculation assumes that the input indices are sorted */
+  ptop_scatter->ns = sendIndices[ns-1]-sendIndices[0]+1;
+  ptop_scatter->sendLowestIndex = sendIndices[0];
+  ptop_scatter->nr = recvIndices[nr-1]-recvIndices[0]+1;
+  ptop_scatter->recvLowestIndex = recvIndices[0];
+
+  /* assign indices */
+  cci->scatter = (VecScatterCUSPIndices_PtoP)ptop_scatter;
+  cci->scatterType = VEC_SCATTER_CUSP_PTOP;
+
+  *ci = cci;
+  PetscFunctionReturn(0);
+}
+
+#undef __FUNCT__
+#define __FUNCT__ "VecScatterCUSPIndicesDestroy"
+PetscErrorCode VecScatterCUSPIndicesDestroy(PetscCUSPIndices *ci)
+{
+  PetscFunctionBegin;
+  if (!(*ci)) PetscFunctionReturn(0);
+  try {
+    if (ci) {
+      if ((*ci)->scatterType == VEC_SCATTER_CUSP_PTOP) {
+	delete (VecScatterCUSPIndices_PtoP)(*ci)->scatter;
+	(*ci)->scatter = 0;
+      } else {
+	cudaError_t err = cudaSuccess;
+	VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)(*ci)->scatter;
+	if (stos_scatter->fslots) {
+	  err = cudaFree(stos_scatter->fslots);CHKERRCUSP((int)err);
+	  stos_scatter->fslots = 0;
+	}
+
+	/* free the GPU memory for the to-slots */
+	if (stos_scatter->tslots) {
+	  err = cudaFree(stos_scatter->tslots);CHKERRCUSP((int)err);
+	  stos_scatter->tslots = 0;
+	}
+
+	/* free the stream variable */
+	if (stos_scatter->stream) {
+	  err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUSP((int)err);
+	  stos_scatter->stream = 0;
+	}
+	delete stos_scatter;
+	(*ci)->scatter = 0;
+      }
+      delete *ci;
+      *ci = 0;
+    }
+  } catch(char *ex) {
+    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
+  }
+  PetscFunctionReturn(0);
+}
+
+/* Insert operator */
+class Insert {
+ public:
+  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
+    return a;
+  }
+};
+
+/* Add operator */
+class Add {
+ public:
+  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
+    return a+b;
+  }
+};
+
+/* Max operator */
+class Max {
+ public:
+  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
+#if !defined(PETSC_USE_COMPLEX)
+    return PetscMax(a,b);
+#else
+    return a; /* MAX_VALUES is never dispatched for complex scalars; keep the return defined */
+#endif
+  }
+};
+
+/* Sequential general to sequential general GPU kernel */
+template<class OPERATOR>
+__global__ void VecScatterCUSP_SGtoSG_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
+  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
+  const int grid_size = gridDim.x * blockDim.x;
+  for (int i = tidx; i < n; i += grid_size) {
+    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
+  }
+}
+
+/* Sequential general to sequential strided GPU kernel */
+template<class OPERATOR>
+__global__ void VecScatterCUSP_SGtoSS_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
+  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
+  const int grid_size = gridDim.x * blockDim.x;
+  for (int i = tidx; i < n; i += grid_size) {
+    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
+  }
+}
+
+/* Sequential strided to sequential strided GPU kernel */
+template<class OPERATOR>
+__global__ void VecScatterCUSP_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
+  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
+  const int grid_size = gridDim.x * blockDim.x;
+  for (int i = tidx; i < n; i += grid_size) {
+    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
+  }
+}
+
+/* Sequential strided to sequential general GPU kernel */
+template<class OPERATOR>
+__global__ void VecScatterCUSP_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
+  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
+  const int grid_size = gridDim.x * blockDim.x;
+  for (int i = tidx; i < n; i += grid_size) {
+    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
+  }
+}
+
+template<class OPERATOR>
+void VecScatterCUSP_StoS_Dispatcher(CUSPARRAY *xarray,CUSPARRAY *yarray,PetscCUSPIndices ci,ScatterMode mode,OPERATOR OP) {
+
+  PetscInt                   nBlocks=0,nThreads=128;
+  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;
+
+  nBlocks=(int)ceil(((float) stos_scatter->n)/((float) nThreads))+1;
+  if (nBlocks>stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
+    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
+  }
+  dim3 block(nThreads,1,1);
+  dim3 grid(nBlocks,1,1);
+
+  if (mode == SCATTER_FORWARD) {
+    if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
+      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
+    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
+      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
+    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
+      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
+    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
+      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
+    }
+  } else {
+    if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
+      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
+    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
+      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
+    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
+      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
+    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
+      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
+    }
+  }
+}
+
+#undef __FUNCT__
+#define __FUNCT__ "VecScatterCUSP_StoS"
+/* VecScatterCUSP_StoS - execute a sequential-to-sequential vector scatter
+   entirely on the GPU.
+
+   x    - source vector (scattered FROM)
+   y    - destination vector (scattered INTO)
+   ci   - PetscCUSPIndices whose ->scatter member holds the StoS scatter
+          description (index arrays / stride parameters and a CUDA stream)
+   addv - INSERT_VALUES, ADD_VALUES, or (real-scalar builds only) MAX_VALUES
+   mode - ScatterMode forwarded to the dispatcher, which picks the kernel
+          and the direction of the index sets
+
+   NOTE(review): on the SETERRQ early-error path below the two device
+   arrays obtained via VecCUSPGetArray{Read,Write} are not restored;
+   presumably acceptable because the error is fatal -- confirm. */
+PetscErrorCode VecScatterCUSP_StoS(Vec x,Vec y,PetscCUSPIndices ci,InsertMode addv,ScatterMode mode)
+{
+  PetscErrorCode             ierr;
+  CUSPARRAY                  *xarray,*yarray;
+  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;
+  cudaError_t                err = cudaSuccess;
+
+  PetscFunctionBegin;
+  /* make sure both vectors have GPU storage before taking device pointers */
+  ierr = VecCUSPAllocateCheck(x);CHKERRQ(ierr);
+  ierr = VecCUSPAllocateCheck(y);CHKERRQ(ierr);
+  ierr = VecCUSPGetArrayRead(x,&xarray);CHKERRQ(ierr);
+  ierr = VecCUSPGetArrayWrite(y,&yarray);CHKERRQ(ierr);
+  /* skip the kernel launch entirely for an empty scatter */
+  if (stos_scatter->n) {
+    /* dispatch on the combine mode; the functor (Insert/Add/Max) is applied per element */
+    if (addv == INSERT_VALUES)
+      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
+    else if (addv == ADD_VALUES)
+      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
+#if !defined(PETSC_USE_COMPLEX)
+    /* MAX_VALUES has no meaning for complex scalars, so it is compiled out there */
+    else if (addv == MAX_VALUES)
+      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
+#endif
+    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
+    /* pick up any kernel-launch failure, then block until the scatter
+       kernels queued on this scatter's private stream have completed */
+    err = cudaGetLastError();CHKERRCUSP((int)err);
+    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUSP((int)err);
+  }
+  ierr = VecCUSPRestoreArrayRead(x,&xarray);CHKERRQ(ierr);
+  ierr = VecCUSPRestoreArrayWrite(y,&yarray);CHKERRQ(ierr);
+  PetscFunctionReturn(0);
+}

File src/vec/vec/utils/veccusp/vscatcusp.c

 #include <petsc-private/isimpl.h>
 #include <petsc-private/vecimpl.h>             /*I "petscvec.h" I*/
 
+PETSC_INTERN PetscErrorCode VecScatterCUSPIndicesCreate_PtoP(PetscInt, PetscInt*,PetscInt, PetscInt*,PetscCUSPIndices*);
+
 #undef __FUNCT__
 #define __FUNCT__ "VecScatterInitializeForGPU"
 /*@
       ierr = PetscFree(tindicesRecvs);CHKERRQ(ierr);
 
       /* create GPU indices, work vectors, ... */
-      ierr = PetscCUSPIndicesCreate(ns*bs,sindicesSends,nr*from->bs,sindicesRecvs,(PetscCUSPIndices*)&inctx->spptr);CHKERRQ(ierr);
+      ierr = VecScatterCUSPIndicesCreate_PtoP(ns*bs,sindicesSends,nr*from->bs,sindicesRecvs,(PetscCUSPIndices*)&inctx->spptr);CHKERRQ(ierr);
       ierr = PetscFree(sindicesSends);CHKERRQ(ierr);
       ierr = PetscFree(sindicesRecvs);CHKERRQ(ierr);
     }

File src/vec/vec/utils/vpscat.c

   ierr = PetscFree4(from->values,from->indices,from->starts,from->procs);CHKERRQ(ierr);
   ierr = PetscFree(from);CHKERRQ(ierr);
   ierr = PetscFree(to);CHKERRQ(ierr);
-#if defined(PETSC_HAVE_CUSP)
-  ierr = PetscCUSPIndicesDestroy((PetscCUSPIndices*)&ctx->spptr);CHKERRQ(ierr);
-#endif
   PetscFunctionReturn(0);
 }
 

File src/vec/vec/utils/vpscat.h

         for (k=0; k<bs; k++) sindices[i*bs+k] = tindices[i]+k;
       }
       ierr = PetscFree(tindices);CHKERRQ(ierr);
-      ierr = PetscCUSPIndicesCreate(n*bs,sindices,n*bs,sindices,(PetscCUSPIndices*)&ctx->spptr);CHKERRQ(ierr);
+      ierr = VecScatterCUSPIndicesCreate_PtoP(n*bs,sindices,n*bs,sindices,(PetscCUSPIndices*)&ctx->spptr);CHKERRQ(ierr);
       ierr = PetscFree(sindices);CHKERRQ(ierr);
     }
     ierr = VecCUSPCopyFromGPUSome_Public(xin,(PetscCUSPIndices)ctx->spptr);CHKERRQ(ierr);

File src/vec/vec/utils/vscat.c

 
 #include <petsc-private/vecimpl.h>    /*I   "petscvec.h"    I*/
 
+#if defined(PETSC_HAVE_CUSP)
+PETSC_INTERN PetscErrorCode VecScatterCUSPIndicesCreate_StoS(PetscInt,PetscInt,PetscInt,PetscInt,PetscInt,PetscInt*,PetscInt*,PetscCUSPIndices*);
+PETSC_INTERN PetscErrorCode VecScatterCUSPIndicesDestroy(PetscCUSPIndices*);
+PETSC_INTERN PetscErrorCode VecScatterCUSP_StoS(Vec,Vec,PetscCUSPIndices,InsertMode,ScatterMode);
+#endif
+
 /* Logging support */
 PetscClassId VEC_SCATTER_CLASSID;
 
   PetscScalar            *xv,*yv;
 
   PetscFunctionBegin;
+#if defined(PETSC_HAVE_CUSP)
+  if (x->valid_GPU_array == PETSC_CUSP_GPU) {
+    /* create the scatter indices if not done already */
+    if (!ctx->spptr) {
+      PetscInt tofirst = 0,tostep = 0,fromfirst = 0,fromstep = 0;
+      fslots = gen_from->vslots;
+      tslots = gen_to->vslots;
+      ierr = VecScatterCUSPIndicesCreate_StoS(n,tofirst,fromfirst,tostep,fromstep,tslots,fslots,(PetscCUSPIndices*)&(ctx->spptr));CHKERRQ(ierr);
+    }
+    /* next do the scatter */
+    ierr = VecScatterCUSP_StoS(x,y,(PetscCUSPIndices)ctx->spptr,addv,mode);
+    PetscFunctionReturn(0);
+  }
+#endif
+
   ierr = VecGetArray(x,&xv);CHKERRQ(ierr);
   if (x != y) {
     ierr = VecGetArray(y,&yv);CHKERRQ(ierr);
   PetscScalar            *xv,*yv;
 
   PetscFunctionBegin;
+#if defined(PETSC_HAVE_CUSP)
+  if (x->valid_GPU_array == PETSC_CUSP_GPU) {
+    /* create the scatter indices if not done already */
+    if (!ctx->spptr) {
+      PetscInt tofirst = first,tostep = 1,fromfirst = 0,fromstep = 0;
+      PetscInt *tslots = 0;
+      ierr = VecScatterCUSPIndicesCreate_StoS(n,tofirst,fromfirst,tostep,fromstep,tslots,fslots,(PetscCUSPIndices*)&(ctx->spptr));CHKERRQ(ierr);
+    }
+    /* next do the scatter */
+    ierr = VecScatterCUSP_StoS(x,y,(PetscCUSPIndices)ctx->spptr,addv,mode);
+    PetscFunctionReturn(0);
+  }
+#endif
+
   ierr = VecGetArray(x,&xv);CHKERRQ(ierr);
   if (x != y) {
     ierr = VecGetArray(y,&yv);CHKERRQ(ierr);
   PetscScalar            *xv,*yv;
 
   PetscFunctionBegin;
+#if defined(PETSC_HAVE_CUSP)
+  if (x->valid_GPU_array == PETSC_CUSP_GPU) {
+    /* create the scatter indices if not done already */
+    if (!ctx->spptr) {
+      PetscInt tofirst = first,tostep = step,fromfirst = 0,fromstep = 0;
+      PetscInt * tslots = 0;
+      ierr = VecScatterCUSPIndicesCreate_StoS(n,tofirst,fromfirst,tostep,fromstep,tslots,fslots,(PetscCUSPIndices*)&(ctx->spptr));CHKERRQ(ierr);
+    }
+    /* next do the scatter */
+    ierr = VecScatterCUSP_StoS(x,y,(PetscCUSPIndices)ctx->spptr,addv,mode);
+    PetscFunctionReturn(0);
+  }
+#endif
+
   ierr = VecGetArray(x,&xv);CHKERRQ(ierr);
   if (x != y) {
     ierr = VecGetArray(y,&yv);CHKERRQ(ierr);
   PetscScalar            *xv,*yv;
 
   PetscFunctionBegin;
+#if defined(PETSC_HAVE_CUSP)
+  if (x->valid_GPU_array == PETSC_CUSP_GPU) {
+    /* create the scatter indices if not done already */
+    if (!ctx->spptr) {
+      PetscInt tofirst = 0,tostep = 0,fromfirst = first,fromstep = 1;
+      PetscInt * tslots = 0;
+      ierr = VecScatterCUSPIndicesCreate_StoS(n,tofirst,fromfirst,tostep,fromstep,tslots,fslots,(PetscCUSPIndices*)&(ctx->spptr));CHKERRQ(ierr);
+    }
+    /* next do the scatter */
+    ierr = VecScatterCUSP_StoS(x,y,(PetscCUSPIndices)ctx->spptr,addv,mode);
+    PetscFunctionReturn(0);
+  }
+#endif
+
   ierr = VecGetArray(x,&xv);CHKERRQ(ierr);
   if (x != y) {
     ierr = VecGetArray(y,&yv);CHKERRQ(ierr);
   PetscScalar            *xv,*yv;
 
   PetscFunctionBegin;
+#if defined(PETSC_HAVE_CUSP)
+  if (x->valid_GPU_array == PETSC_CUSP_GPU) {
+    /* create the scatter indices if not done already */
+    if (!ctx->spptr) {
+      PetscInt tofirst = 0,tostep = 0,fromfirst = first,fromstep = step;
+      PetscInt * tslots = 0;
+      ierr = VecScatterCUSPIndicesCreate_StoS(n,tofirst,fromfirst,tostep,fromstep,fslots,tslots,(PetscCUSPIndices*)&(ctx->spptr));CHKERRQ(ierr);
+    }
+    /* next do the scatter */
+    ierr = VecScatterCUSP_StoS(x,y,(PetscCUSPIndices)ctx->spptr,addv,mode);
+    PetscFunctionReturn(0);
+  }
+#endif
+
   ierr = VecGetArray(x,&xv);CHKERRQ(ierr);
   if (x != y) {
     ierr = VecGetArray(y,&yv);CHKERRQ(ierr);
   PetscScalar           *xv,*yv;
 
   PetscFunctionBegin;
+#if defined(PETSC_HAVE_CUSP)
+  if (x->valid_GPU_array == PETSC_CUSP_GPU) {
+    /* create the scatter indices if not done already */
+    if (!ctx->spptr) {
+      PetscInt *tslots = 0,*fslots;
+      ierr = VecScatterCUSPIndicesCreate_StoS(n,to_first,from_first,to_step,from_step,tslots,fslots,(PetscCUSPIndices*)&(ctx->spptr));CHKERRQ(ierr);
+    }
+    /* next do the scatter */
+    ierr = VecScatterCUSP_StoS(x,y,(PetscCUSPIndices)ctx->spptr,addv,mode);
+    PetscFunctionReturn(0);
+  }
+#endif
+
   ierr = VecGetArrayRead(x,(const PetscScalar **)&xv);CHKERRQ(ierr);
   if (x != y) {
     ierr = VecGetArray(y,&yv);CHKERRQ(ierr);
   /* if memory was published with AMS then destroy it */
   ierr = PetscObjectAMSViewOff((PetscObject)(*ctx));CHKERRQ(ierr);
   ierr = (*(*ctx)->destroy)(*ctx);CHKERRQ(ierr);
+#if defined(PETSC_HAVE_CUSP)
+  ierr = VecScatterCUSPIndicesDestroy((PetscCUSPIndices*)&((*ctx)->spptr));CHKERRQ(ierr);
+#endif
   ierr = PetscHeaderDestroy(ctx);CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }