Commits

Michael Lange committed 80533da Draft

Make non-zero (NZ) balancing optional (option: -matmult_nz_balance):
* Add an even, contiguous-row thread partitioning scheme across the N-1 worker threads; this is used when the option is not set
* Explicit NZ-based thread balancing is activated via the run-time option -matmult_nz_balance

  • Participants
  • Parent commits eacc412

Comments (0)

Files changed (1)

File include/petsc-private/matimpl.h

 #define __FUNCT__ "MatThreadPartition"
 PETSC_STATIC_INLINE PetscErrorCode MatThreadPartition(Mat A, PetscInt m, PetscInt nz, const PetscInt *ii)
 {
-  PetscInt          i, t, nthread, chunk, my_nz, new_nz;
-  PetscBool         improved=PETSC_TRUE;
+  PetscInt          i, t, nthread, chunk, my_nz, new_nz, ierr;
+  PetscBool         flg,nz_balance=PETSC_FALSE,improved=PETSC_TRUE;
 
   PetscFunctionBegin;
 #if defined(PETSC_HAVE_OPENMP)
   nthread = 1;
 #endif
 
+  ierr = PetscOptionsGetBool(PETSC_NULL,"-matmult_nz_balance",&nz_balance, &flg);CHKERRQ(ierr);
+
   if (nthread > 1 ) {
-    // N-1 worker threads
-    chunk = nz / (nthread-1);
-    A->thread_start[0] = 0;
-    A->thread_end[0] = 0;
-    A->thread_start[1] = 0;
-    t = 2;
-    for (i=0; i<m+1; i++){
-      if (ii[i] >= (t*chunk) && t < nthread) {
-	A->thread_start[t] = i;
-	A->thread_end[t-1] = i;
-	t++; 
+    if (nz_balance && nz > 0) {
+      // N-1 worker threads
+      chunk = nz / (nthread-1);
+      A->thread_start[0] = 0;
+      A->thread_end[0] = 0;
+      A->thread_start[1] = 0;
+      t = 2;
+      for (i=0; i<m+1; i++){
+	if (ii[i] >= (t*chunk) && t < nthread) {
+	  A->thread_start[t] = i;
+	  A->thread_end[t-1] = i;
+	  t++; 
+	}
       }
-    }
-    A->thread_end[nthread-1] = m;
+      A->thread_end[nthread-1] = m;
 
-    /* Now apply local diffusion */
-    while(improved){
-      improved = PETSC_FALSE;
-      for(i=1; i<nthread; i++){
-	if(i < nthread-1){
-	  // Look ahead
-	  my_nz = ii[ A->thread_end[i] ] - ii[ A->thread_start[i] ];
-	  new_nz = ii[ A->thread_end[i+1] ] - ii[ A->thread_start[i+1]-1 ];
-	  if(new_nz < my_nz){
-	    A->thread_end[i] -= 1;
-	    A->thread_start[i+1] -= 1;
-	    improved = PETSC_TRUE;
+      /* Now apply local diffusion */
+      while(improved){
+	improved = PETSC_FALSE;
+	for(i=1; i<nthread; i++){
+	  if(i < nthread-1){
+	    // Look ahead
+	    my_nz = ii[ A->thread_end[i] ] - ii[ A->thread_start[i] ];
+	    new_nz = ii[ A->thread_end[i+1] ] - ii[ A->thread_start[i+1]-1 ];
+	    if(new_nz < my_nz){
+	      A->thread_end[i] -= 1;
+	      A->thread_start[i+1] -= 1;
+	      improved = PETSC_TRUE;
+	    }
 	  }
-	}
-	if(i > 1){
-	  // Look back
-	  my_nz = ii[ A->thread_end[i] ] - ii[ A->thread_start[i] ];
-	  new_nz = ii[ A->thread_end[i-1] + 1 ] - ii[ A->thread_start[i-1] ];
-	  if(new_nz < my_nz){
-	    A->thread_start[i] += 1;
-	    A->thread_end[i-1] += 1;
-	    improved = PETSC_TRUE;
+	  if(i > 1){
+	    // Look back
+	    my_nz = ii[ A->thread_end[i] ] - ii[ A->thread_start[i] ];
+	    new_nz = ii[ A->thread_end[i-1] + 1 ] - ii[ A->thread_start[i-1] ];
+	    if(new_nz < my_nz){
+	      A->thread_start[i] += 1;
+	      A->thread_end[i-1] += 1;
+	      improved = PETSC_TRUE;
+	    }
 	  }
 	}
       }
+
+    } else {
+      /* Even block partitioning based on N-1 worker threads */
+      chunk = m / (nthread-1);
+      if (m % (nthread-1) > 0) chunk++;
+
+      A->thread_start[0] = 0;
+      A->thread_end[0] = 0;
+      for (t=1; t<nthread; t++) {
+	A->thread_start[t] = (t-1) * chunk;
+	A->thread_end[t] = A->thread_start[t] + chunk;
+	if (t == nthread-1) A->thread_end[t] = m;
+      }
     }
-
   } else {
     for (i=0; i<nthread; i++) {
       A->thread_start[i] = 0;