Michael Lange committed 2c8dacd (Draft)

Backing out the latest changes concerning task-based spMVM and latency hiding.
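For context, the backed-out version of MatMult_MPIAIJ dedicated OpenMP thread 0 to the MPI halo exchange while the remaining threads multiplied the local diagonal block, with a barrier before the off-diagonal block was applied; MatMult_SeqAIJ correspondingly partitioned its rows over threads 1..n-1 via MatMultThreadPartition. A minimal non-PETSc sketch of that task-based overlap pattern (the halo_exchange, local_spmv and offdiag_spmv_add callbacks are hypothetical placeholders, not names from the source) looks like this:

#include <omp.h>

/* Sketch of the task-based latency-hiding pattern being reverted: thread 0
 * drives the MPI halo exchange while the remaining threads compute the local
 * (diagonal-block) product, all threads meet at a barrier, and the workers
 * then add the off-diagonal contribution that needs the ghost values.
 * Assumes nthreads > 1, mirroring the PetscGetMaxThreads() > 1 guard. */
void spmv_overlap_sketch(int nthreads,
                         void (*halo_exchange)(void),
                         void (*local_spmv)(int worker, int nworkers),
                         void (*offdiag_spmv_add)(int worker, int nworkers))
{
#pragma omp parallel num_threads(nthreads)
  {
    int tid = omp_get_thread_num();
    if (tid == 0) {
      halo_exchange();                          /* communication only        */
    } else {
      local_spmv(tid - 1, nthreads - 1);        /* y = A_diag * x_local      */
    }
#pragma omp barrier                             /* ghost values now in place */
    if (tid > 0) {
      offdiag_spmv_add(tid - 1, nthreads - 1);  /* y += B_offdiag * x_ghost  */
    }
  }
}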


Files changed (2)

src/mat/impls/aij/mpi/mpiaij.c

 {
   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
   PetscErrorCode ierr;
-  PetscInt       nt, tid;
+  PetscInt       nt;
 
   PetscFunctionBegin;
   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
-
-#if defined(PETSC_HAVE_OPENMP)
-  if (PetscGetMaxThreads() > 1) {
-    /* Thread 0 is dedicated to MPI comm to ensure effective latency hiding */
-#pragma omp parallel default(none) shared(a,A,xx,yy) private(tid,ierr)
-    {      
-      tid = PetscGetThreadNum();
-      if (tid > 0){
-	/* CHKERRQ is not yet thread-safe */
-	ierr = (*a->A->ops->mult)(a->A,xx,yy);//CHKERRQ(ierr);
-      } else {
-	ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);//CHKERRQ(ierr);
-	ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);//CHKERRQ(ierr);
-      }
-#pragma omp barrier
-      if (tid > 0){
-	ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);//CHKERRQ(ierr);
-      }
-    }
-  } else {
-    ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
-    ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
-    ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
-    ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
-  }    
-#else
   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
-#endif
   PetscFunctionReturn(0);
 }
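After the revert, MatMult_MPIAIJ falls back to the plain sequence restored above: begin the forward scatter of the ghost values, multiply the diagonal block while the scatter is in flight, end the scatter, then add the off-diagonal contribution. A rough sketch of that ordering with non-blocking MPI (one neighbour each way; the buffer names and spmv_* callbacks are hypothetical, and the real code uses VecScatterBegin/End over a general communication pattern) would be:

#include <mpi.h>

/* Sketch only: start the exchange, overlap it with the diagonal-block
 * multiply, wait, then apply the off-diagonal block on the ghost values. */
void mpiaij_mult_sketch(double *x_local, double *x_ghost, double *y,
                        int nsend, int nghost, int send_to, int recv_from,
                        MPI_Comm comm,
                        void (*spmv_diag)(const double *x, double *y),
                        void (*spmv_offdiag_add)(const double *xg, double *y))
{
  MPI_Request req[2];

  /* "VecScatterBegin": post the ghost-value exchange. */
  MPI_Irecv(x_ghost, nghost, MPI_DOUBLE, recv_from, 0, comm, &req[0]);
  MPI_Isend(x_local, nsend, MPI_DOUBLE, send_to, 0, comm, &req[1]);

  /* Diagonal-block multiply runs while the messages are in flight. */
  spmv_diag(x_local, y);

  /* "VecScatterEnd": wait until the ghost values have arrived. */
  MPI_Waitall(2, req, MPI_STATUSES_IGNORE);

  /* Off-diagonal block needs the ghost values: y += B * x_ghost. */
  spmv_offdiag_add(x_ghost, y);
}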
 

src/mat/impls/aij/seq/aij.c

   PetscFunctionReturn(0);
 }
 
-#undef __FUNCT__
-#define __FUNCT__ "MatMultThreadPartition"
-PetscErrorCode MatMultThreadPartition(PetscInt m, PetscInt *start, PetscInt *end)
-{
-  PetscFunctionBegin;
-#ifdef _OPENMP
-  /* Thread 0 is dedicated to MPI comm */
-  PetscInt tid = PetscGetThreadNum() - 1;
-  PetscInt nthreads = PetscGetNumThreads() - 1;
-  PetscInt nblock;
-  
-  if (PetscGetMaxThreads() > 1) {
-    if (nthreads > 0) {
-      nblock = m / nthreads;
-      if (m % nthreads > 0) nblock++;
-    } else {
-    nblock = m;
-    }
-    *start = tid * nblock;
-    *end = *start + nblock;
-    if (tid == nthreads-1) *end = m;
-  } else {
-    *start = 0;
-    *end = m;
-  }    
-#else
-  *start = 0;
-  *end = m;
-#endif
-  PetscFunctionReturn(0);
-}
-
 #include <../src/mat/impls/aij/seq/ftn-kernels/fmult.h>
 #undef __FUNCT__
 #define __FUNCT__ "MatMult_SeqAIJ"
   PetscErrorCode    ierr;
   PetscInt          m=A->rmap->n;
   const PetscInt    *aj,*ii,*ridx=PETSC_NULL;
-  PetscInt          n,i,nonzerorow=0,nonzerorow_local=0,start, end;
+  PetscInt          n,i,nonzerorow=0;
   PetscScalar       sum;
   PetscBool         usecprow=a->compressedrow.use;
 
   aj  = a->j;
   aa  = a->a;
   ii  = a->i;
-  MatMultThreadPartition(m, &start, &end);
-
   if (usecprow){ /* use compressed row format */
     m    = a->compressedrow.nrows;
     ii   = a->compressedrow.i;
     ridx = a->compressedrow.rindex;
-    for (i=start; i<end; i++){
+#pragma omp parallel for schedule(static) default(none) private(i, n, aj, aa, sum) shared(m, ii, a, x, y, ridx) reduction(+:nonzerorow)
+    for (i=0; i<m; i++){
       n   = ii[i+1] - ii[i]; 
       aj  = a->j + ii[i];
       aa  = a->a + ii[i];
       sum = 0.0;
-      nonzerorow_local += (n>0);
+      nonzerorow += (n>0);
       PetscSparseDensePlusDot(sum,x,aa,aj,n); 
       /* for (j=0; j<n; j++) sum += (*aa++)*x[*aj++]; */
       y[ridx[i]] = sum;
 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJ)
     fortranmultaij_(&m,x,ii,aj,aa,y);
 #else
-    for (i=start; i<end; i++) {
+#pragma omp parallel for schedule(static) default(none) private(i,n,aj,aa,sum) shared(m,ii,a,x,y) reduction(+:nonzerorow)
+    for (i=0; i<m; i++) {
       n   = ii[i+1] - ii[i];
       aj  = a->j + ii[i];
       aa  = a->a + ii[i];
       sum  = 0.0;
-      nonzerorow_local += (n>0); 
+      nonzerorow += (n>0); 
       PetscSparseDensePlusDot(sum,x,aa,aj,n);
       y[i] = sum;
     }
 #endif
   }
-  PetscPragmaOMP(atomic) nonzerorow += nonzerorow_local;
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(2.0*a->nz - nonzerorow);//CHKERRQ(ierr);
-  }
+  ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
   PetscFunctionReturn(0);
   const MatScalar   *aa;
   PetscErrorCode    ierr;
   PetscInt          m = A->rmap->n,*aj,*ii;
-  PetscInt          n,i,*ridx=PETSC_NULL,start,end;
+  PetscInt          n,i,*ridx=PETSC_NULL;
   PetscScalar       sum;
   PetscBool         usecprow=a->compressedrow.use;
 
   aj  = a->j;
   aa  = a->a;
   ii  = a->i;
-  MatMultThreadPartition(m, &start, &end);
-
   if (usecprow){ /* use compressed row format */
     if (zz != yy){
       ierr = PetscMemcpy(z,y,m*sizeof(PetscScalar));CHKERRQ(ierr);
     m    = a->compressedrow.nrows;
     ii   = a->compressedrow.i;
     ridx = a->compressedrow.rindex;
-    for (i=start; i<end; i++){
+#pragma omp parallel for schedule(static) default(none) private(i, n, aj, aa, sum) shared(m, ii, a, y, ridx, z, x)
+    for (i=0; i<m; i++){
       n  = ii[i+1] - ii[i];
       aj  = a->j + ii[i];
       aa  = a->a + ii[i];
 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJ)
   fortranmultaddaij_(&m,x,ii,aj,aa,y,z);
 #else
-    for (i=start; i<end; i++) {
+#pragma omp parallel for schedule(static) default(none) private(i, n, aj, aa, sum) shared(m, ii, a, y, z, x)
+    for (i=0; i<m; i++) {
       n    = ii[i+1] - ii[i];
       aj  = a->j + ii[i];
       aa  = a->a + ii[i];
     }
 #endif
   }
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(2.0*a->nz);//CHKERRQ(ierr);
-  }
+  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
   if (zz != yy) {
   PetscInt       i;
   PetscInt       m;
 #if defined(PETSC_HAVE_OPENMP)
-  PetscInt       j,start,end;
+  PetscInt       j;
 #endif
 
   PetscFunctionBegin;
     b->free_ij      = PETSC_TRUE;
 
 #if defined(PETSC_HAVE_OPENMP)
-  /* page by matrix and column indices by rows */
-#pragma omp parallel default(none) private(i, j, start, end) shared(b, m)
-    if (PetscGetThreadNum() > 0) {
-      MatMultThreadPartition(m, &start, &end);
-      for (i=start; i < end; i++ ) {
-	for(j=b->i[i]; j<b->i[i+1]; j++) {
-	  b->a[j] = (PetscScalar)0.0;
-	  b->j[j] = (PetscScalar)0.0;
-	}
+#pragma omp parallel for schedule(static) default(none) private(i, j) shared(b, m)
+    /* page by matrix and column indices by rows */
+    for (i=0; i < m; i++ ) {
+      for(j=b->i[i]; j<b->i[i+1]; j++) {
+	b->a[j] = (PetscScalar)0.0;
+	b->j[j] = (PetscScalar)0.0;
       }
     }
 #endif
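The restored MatMult_SeqAIJ kernels above replace the manual MatMultThreadPartition split with plain loop-level OpenMP and a reduction on nonzerorow. Stripped of the PETSc specifics, the structure is roughly the following (standard CSR array names, assumed here rather than taken from the source; compile with OpenMP enabled, e.g. -fopenmp):

long csr_spmv(int m, const int *rowptr, const int *colind,
              const double *val, const double *x, double *y)
{
  long nonzerorow = 0;  /* rows with at least one entry, kept for flop logging */
  int  i;
#pragma omp parallel for schedule(static) reduction(+:nonzerorow)
  for (i = 0; i < m; i++) {
    double sum = 0.0;
    int    n   = rowptr[i+1] - rowptr[i];
    nonzerorow += (n > 0);
    for (int j = rowptr[i]; j < rowptr[i+1]; j++) sum += val[j] * x[colind[j]];
    y[i] = sum;
  }
  return nonzerorow;
}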