Commits

Michael Lange committed 054e50b Draft

Final corrections:
* Another bugfix for MatThreadPartition
* Fixing the Flops logging
* Adding (void*) casts to interface guards

  • Participants
  • Parent commits ea11b1e

Comments (0)

Files changed (4)

File include/petsc-private/matimpl.h

 
   if (nthread > 0 && nz > 0) {
     chunk = nz / nthread;
-    if (nz % nthread > 0) chunk++;
     A->thread_start[0] = 0;
     t = 1;
     for (i=0; i<m+1; i++){

File src/mat/impls/aij/seq/aij.c

     }
 #endif
   }
-  PetscPragmaOMP(atomic) nonzerorow += nonzerorow_local;
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(2.0*a->nz - nonzerorow);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
+  ierr = PetscLogFlops(2.0*(ii[end] - ii[start]) - nonzerorow);CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 
     }
 #endif
   }
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(2.0*a->nz);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
   if (zz != yy) {
     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
   }
+  ierr = PetscLogFlops(2.0*(ii[end] - ii[start]));CHKERRQ(ierr);
 #if defined(PETSC_HAVE_CUSP)
   /*
   ierr = VecView(xx,0);CHKERRQ(ierr);

File src/mat/impls/baij/seq/baij2.c

   const PetscScalar *x;
   const MatScalar   *v;
   PetscErrorCode    ierr;
-  PetscInt          mbs,i,n,z_idx,nonzerorow=0,nonzerorow_local=0,start,end,thread;
+  PetscInt          mbs,i,n,z_idx;
+  PetscInt          nonzerorow=0,start,end,thread;
   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
   PetscBool         usecprow=a->compressedrow.use;
 
     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
     sum  = 0.0;
     PetscSparseDensePlusDot(sum,x,v,idx,n);
-    nonzerorow_local += (n>0);
+    nonzerorow += (n>0);
     z[z_idx] = sum;
   }
-
-  PetscPragmaOMP(atomic) nonzerorow += nonzerorow_local;
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(2.0*a->nz - nonzerorow);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
+  ierr = PetscLogFlops(2.0*(ii[end]-ii[start]) - nonzerorow);CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 
   const MatScalar   *v;
   PetscErrorCode    ierr;
   PetscInt          i,*idx,*ii,j,n,*ridx=PETSC_NULL,z_idx;
-  PetscInt          nonzerorow=0,nonzerorow_local=0,start,end,thread;
+  PetscInt          nonzerorow=0,start,end,thread;
   PetscBool         usecprow=a->compressedrow.use;
 
   PetscFunctionBegin;
     sum1 = 0.0; sum2 = 0.0;
     z_idx = i;
     if (usecprow) z_idx = ridx[i];
-    nonzerorow_local += (n>0);
+    nonzerorow += (n>0);
     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
     for (j=0; j<n; j++) {
     }
     z[2*z_idx] = sum1; z[2*z_idx+1] = sum2;
   }
-
-  PetscPragmaOMP(atomic) nonzerorow += nonzerorow_local;
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
+  ierr = PetscLogFlops(8.0*(ii[end]-ii[start]) - 2.0*nonzerorow);CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 
   const MatScalar   *v;
   PetscErrorCode    ierr;
   PetscInt          i,*idx,*ii,j,n,*ridx=PETSC_NULL,z_idx;
-  PetscInt          nonzerorow=0,nonzerorow_local=0,start,end,thread;
+  PetscInt          nonzerorow=0,start,end,thread;
   PetscBool         usecprow=a->compressedrow.use;
   
 
     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
     z_idx = i;
     if (usecprow) z_idx = ridx[i];
-    nonzerorow_local += (n>0);
+    nonzerorow += (n>0);
     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
     for (j=0; j<n; j++) {
     }
     z[3*z_idx] = sum1; z[3*z_idx+1] = sum2; z[3*z_idx+2] = sum3;
   }
-
-  PetscPragmaOMP(atomic) nonzerorow += nonzerorow_local;
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);//CHKERRQ(ierr);
-  }
+  ierr = PetscLogFlops(18.0*(ii[end]-ii[start]) - 3.0*nonzerorow);CHKERRQ(ierr);
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
   PetscFunctionReturn(0);
   const MatScalar   *v;
   PetscErrorCode    ierr;
   PetscInt          i,*idx,*ii,j,n,*ridx=PETSC_NULL,z_idx;
-  PetscInt          nonzerorow=0,nonzerorow_local=0,start,end,thread;
+  PetscInt          nonzerorow=0,start,end,thread;
   PetscBool         usecprow=a->compressedrow.use;
 
   PetscFunctionBegin;
     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
     z_idx = i;
     if (usecprow) z_idx = ridx[i];
-    nonzerorow_local += (n>0);
+    nonzerorow += (n>0);
     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
     for (j=0; j<n; j++) {
     }
     z[4*z_idx] = sum1; z[4*z_idx+1] = sum2; z[4*z_idx+2] = sum3; z[4*z_idx+3] = sum4;
   }
-
-  PetscPragmaOMP(atomic) nonzerorow += nonzerorow_local;
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
-  ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
+  ierr = PetscLogFlops(32.0*(ii[end]-ii[start]) - 4.0*nonzerorow);CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 
   const MatScalar   *v;
   PetscErrorCode    ierr;
   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
-  PetscInt          i,j,n,z_idx,nonzerorow=0,nonzerorow_local=0,start,end,thread;
+  PetscInt          i,j,n,z_idx,nonzerorow=0,start,end,thread;
   PetscBool         usecprow=a->compressedrow.use;
 
   PetscFunctionBegin;
     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
     z_idx = i;
     if (usecprow) z_idx = ridx[i];
-    nonzerorow_local += (n>0);
+    nonzerorow += (n>0);
     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
     for (j=0; j<n; j++) {
     z[5*z_idx] = sum1; z[5*z_idx+1] = sum2; z[5*z_idx+2] = sum3;
     z[5*z_idx+3] = sum4; z[5*z_idx+4] = sum5;
   }
-
-  PetscPragmaOMP(atomic) nonzerorow += nonzerorow_local;
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);//CHKERRQ(ierr);
-  }
+  ierr = PetscLogFlops(50.0*(ii[end]-ii[start]) - 5.0*nonzerorow);CHKERRQ(ierr);
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
   PetscFunctionReturn(0);
   const MatScalar   *v;
   PetscErrorCode    ierr;
   PetscInt          i,*idx,*ii,j,n,*ridx=PETSC_NULL,z_idx;
-  PetscInt          nonzerorow=0,nonzerorow_local=0,start,end,thread;
+  PetscInt          nonzerorow=0,start,end,thread;
   PetscBool         usecprow=a->compressedrow.use;
 
   PetscFunctionBegin;
     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
     z_idx = i;
     if (usecprow) z_idx = ridx[i];
-    nonzerorow_local += (n>0);
+    nonzerorow += (n>0);
     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
     for (j=0; j<n; j++) {
     z[6*z_idx]   = sum1; z[6*z_idx+1] = sum2; z[6*z_idx+2] = sum3; 
     z[6*z_idx+3] = sum4; z[6*z_idx+4] = sum5; z[6*z_idx+5] = sum6;
   }
-
-  PetscPragmaOMP(atomic) nonzerorow += nonzerorow_local;
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);//CHKERRQ(ierr);
-  }
+  ierr = PetscLogFlops(72.0*(ii[end]-ii[start]) - 6.0*nonzerorow);CHKERRQ(ierr);
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
   PetscFunctionReturn(0);
   const MatScalar   *v;
   PetscErrorCode    ierr;
   PetscInt          i,*idx,*ii,j,n,*ridx=PETSC_NULL,z_idx;
-  PetscInt          nonzerorow=0,nonzerorow_local=0,start,end,thread;
+  PetscInt          nonzerorow=0,start,end,thread;
   PetscBool         usecprow=a->compressedrow.use;
 
   PetscFunctionBegin;
     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
     z_idx = i;
     if (usecprow) z_idx = ridx[i];
-    nonzerorow_local += (n>0);
+    nonzerorow += (n>0);
     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
     for (j=0; j<n; j++) {
     z[7*z_idx] = sum1; z[7*z_idx+1] = sum2; z[7*z_idx+2] = sum3; z[7*z_idx+3] = sum4; 
     z[7*z_idx+4] = sum5; z[7*z_idx+5] = sum6; z[7*z_idx+6] = sum7;
   }
-
-  PetscPragmaOMP(atomic) nonzerorow += nonzerorow_local;
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
+  ierr = PetscLogFlops(98.0*(ii[end]-ii[start]) - 7.0*nonzerorow);CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 
     z[z_idx] = sum;
   }
 
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(2.0*a->nz);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
   if (zz != yy) {
     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
   }
+  ierr = PetscLogFlops(2.0*(ii[end]-ii[start]));CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 
     }
     z[z_idx] = sum1; z[z_idx+1] = sum2;
   }
-
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(4.0*a->nz);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
   if (zz != yy) {
     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
   }
+  ierr = PetscLogFlops(8.0*(ii[end]-ii[start]));CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 
     }
     z[3*z_idx] = sum1; z[3*z_idx+1] = sum2; z[3*z_idx+2] = sum3;
   }
-
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(18.0*a->nz);//CHKERRQ(ierr);
-  }
+  ierr = PetscLogFlops(18.0*(ii[end] - ii[start]));CHKERRQ(ierr);
   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
   if (zz != yy) {
     }
     z[4*z_idx] = sum1; z[4*z_idx+1] = sum2; z[4*z_idx+2] = sum3; z[4*z_idx+3] = sum4;
   }
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(32.0*a->nz);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
   if (zz != yy) {
     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
   }
+  ierr = PetscLogFlops(32.0*(ii[end] - ii[start]));CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 
     z[z_idx] = sum1; z[z_idx+1] = sum2; z[z_idx+2] = sum3; 
     z[z_idx+3] = sum4; z[z_idx+4] = sum5;
   }
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(50.0*a->nz);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
   if (zz != yy) {
     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
   }
+  ierr = PetscLogFlops(50.0*(ii[end] - ii[start]));CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 #undef __FUNCT__  
     z[z_idx] = sum1;   z[z_idx+1] = sum2; z[z_idx+2] = sum3; 
     z[z_idx+3] = sum4; z[z_idx+4] = sum5; z[z_idx+5] = sum6;
   }
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(72.0*a->nz);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
   if (zz != yy) {
     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
   }
+  ierr = PetscLogFlops(72.0*(ii[end] - ii[start]));CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 
     z[z_idx] = sum1;   z[z_idx+1] = sum2; z[z_idx+2] = sum3; z[z_idx+3] = sum4; 
     z[z_idx+4] = sum5; z[z_idx+5] = sum6; z[z_idx+6] = sum7;
   }
-  PetscPragmaOMP(single nowait) {
-    ierr = PetscLogFlops(98.0*a->nz);//CHKERRQ(ierr);
-  }
   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
   if (zz != yy) {
     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
   }
+  ierr = PetscLogFlops(98.0*(ii[end] - ii[start]));CHKERRQ(ierr);
   PetscFunctionReturn(0);
 }
 

File src/mat/interface/matrix.c

      implementations are actually executed in parallel using this interface.
      To make this safer and more elegant some re-engineering is required.
   */
-  if (mat->ops->mult == MatMult_SeqAIJ) threaded++;
-  if (mat->ops->mult == MatMult_MPIAIJ) threaded++;
-  if (mat->ops->mult == MatMult_SeqBAIJ_1) threaded++;
-  if (mat->ops->mult == MatMult_SeqBAIJ_3) threaded++;
-  if (mat->ops->mult == MatMult_SeqBAIJ_4) threaded++;
-  if (mat->ops->mult == MatMult_MPIBAIJ) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMult_MPIAIJ) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMult_SeqAIJ) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMult_MPIBAIJ) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMult_SeqBAIJ_1) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMult_SeqBAIJ_2) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMult_SeqBAIJ_3) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMult_SeqBAIJ_4) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMult_SeqBAIJ_5) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMult_SeqBAIJ_6) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMult_SeqBAIJ_7) threaded++;
 #pragma omp parallel default(none) shared(mat,x,y) if (threaded)
   {
     (*mat->ops->mult)(mat,x,y);//CHKERRQ(ierr);
      implementations are actually executed in parallel using this interface.
      To make this safer and more elegant some re-engineering is required.
   */
-  if (mat->ops->mult == MatMultAdd_SeqAIJ) threaded++;
-  if (mat->ops->mult == MatMultAdd_MPIAIJ) threaded++;
-  if (mat->ops->mult == MatMultAdd_SeqBAIJ_1) threaded++;
-  if (mat->ops->mult == MatMultAdd_SeqBAIJ_3) threaded++;
-  if (mat->ops->mult == MatMultAdd_SeqBAIJ_4) threaded++;
-  if (mat->ops->mult == MatMultAdd_MPIBAIJ) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMultAdd_MPIAIJ) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMultAdd_SeqAIJ) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMultAdd_MPIBAIJ) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMultAdd_SeqBAIJ_1) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMultAdd_SeqBAIJ_2) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMultAdd_SeqBAIJ_3) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMultAdd_SeqBAIJ_4) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMultAdd_SeqBAIJ_5) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMultAdd_SeqBAIJ_6) threaded++;
+  if ((void*) mat->ops->mult == (void*) MatMultAdd_SeqBAIJ_7) threaded++;
 #pragma omp parallel default(none) shared(mat,v1,v2,v3)
   {
     (*mat->ops->multadd)(mat,v1,v2,v3);//CHKERRQ(ierr);