Rio Yokota avatar Rio Yokota committed 5583f03

Allreduce for vec3, int. Bugfix parallel init mimic. RCRIT = 1e12 to prevent subdivision. Allreduce dipole in test_charmm.

Comments (0)

Files changed (13)

 CXX	+= -I../include
 LFLAGS	+= -D$(BASIS)
 LFLAGS  += -DEXPANSION=4 # Specify expansion order
-LFLAGS	+= -DERROR_OPT # Use error optimized theta
-LFLAGS	+= -DUSE_RMAX # Use Rmax in multipole acceptance criteria
+#LFLAGS	+= -DERROR_OPT # Use error optimized theta
+#LFLAGS	+= -DUSE_RMAX # Use Rmax in multipole acceptance criteria
 LFLAGS	+= -DDUAL # Use dual tree traversal (turn off every option above before turning this off)
 #LFLAGS	+= -DAUTO # Use autotuned selection between P2P or M2L (needs improvement)
 #LFLAGS	+= -DFP64 # Use double precision

examples/ewald.cxx

   }
   pass.downwardPass(cells);
   vec3 localDipole = pass.getDipole(bodies,0);
-  vec3 globalDipole = LET.allreduce(localDipole);
-  int numBodies = LET.allreduce(int(bodies.size()));
+  vec3 globalDipole = LET.allreduceVec3(localDipole);
+  int numBodies = LET.allreduceInt(bodies.size());
   pass.dipoleCorrection(bodies,globalDipole,numBodies,cycle);
   logger.stopPAPI();
   logger.stopTimer("Total FMM");
 
  public:
   Args(int argc=0, char ** argv=NULL) : numBodies(1000000), numTargets(100), ncrit(16), nspawn(1000), images(0),
-    theta(.35), mutual(1), verbose(1), distribution("cube") {
+    theta(.4), mutual(1), verbose(1), distribution("cube") {
     while (1) {
       int option_index;
       int c = getopt_long(argc, argv, "", long_options, &option_index);

include/dataset.h

     srand48(seed);                                              // Set seed for random number generator
     Bodies bodies(numBodies);                                   // Initialize bodies
     for (B_iter B=bodies.begin(); B!=bodies.end(); B++) {       // Loop over bodies
-      if (numSplit != 1 && B-bodies.begin() == int(seed*bodies.size()/numSplit)) {// Mimic parallel dataset
+      if (numSplit != 1 && B-bodies.begin() == int((seed+1)*bodies.size()/numSplit)) {// Mimic parallel dataset
         seed++;                                                 //   Mimic seed at next rank
         srand48(seed);                                          //   Set seed for random number generator
      }                                                         //  Endif for mimicking parallel dataset
     srand48(seed);                                              // Set seed for random number generator
     Bodies bodies(numBodies);                                   // Initialize bodies
     for (B_iter B=bodies.begin(); B!=bodies.end(); B++) {       // Loop over bodies
-      if (numSplit != 1 && B-bodies.begin() == int(seed*bodies.size()/numSplit)) {// Mimic parallel dataset
+      if (numSplit != 1 && B-bodies.begin() == int((seed+1)*bodies.size()/numSplit)) {// Mimic parallel dataset
         seed++;                                                 //   Mimic seed at next rank
         srand48(seed);                                          //   Set seed for random number generator
      }                                                         //  Endif for mimicking parallel dataset
     srand48(seed);                                              // Set seed for random number generator
     Bodies bodies(numBodies);                                   // Initialize bodies
     for (B_iter B=bodies.begin(); B!=bodies.end(); B++) {       // Loop over bodies
-      if (numSplit != 1 && B-bodies.begin() == int(seed*bodies.size()/numSplit)) { // Mimic parallel dataset
+      if (numSplit != 1 && B-bodies.begin() == int((seed+1)*bodies.size()/numSplit)) { // Mimic parallel dataset
         seed++;                                                 //   Mimic seed at next rank
         srand48(seed);                                          //   Set seed for random number generator
      }                                                         //  Endif for mimicking parallel dataset
       for (C_iter CC=C0+Cj->CHILD; CC!=C0+Cj->CHILD+Cj->NCHILD; CC++) {// Loop over cell's children
         traverse(Ci,CC,C0,Xperiodic);                           //   Recursively call traverse
       }                                                         //  End loop over cell's children
-    }                                                           // End if for twig cells
+    }                                                           // End if for far cells
   }
 
 //! Find neighbor cells

include/localessentialtree.h

     int iparent = 0;                                            // Parent send cell's offset
     int level = int(logf(mpisize-1) / M_LN2 / 3) + 1;           // Level of local root cell
     if (mpisize == 1) level = 0;                                // Account for serial case
+    if (C0->NCHILD == 0) {                                      // If root cell is leaf
+      addSendBody(C0, ibody, icell);                            //  Add bodies to send
+    }                                                           // End if for root cell leaf
     while (!cellQueue.empty()) {                                // While traversal queue is not empty
       C_iter C = cellQueue.front();                             //  Get front item in traversal queue
       cellQueue.pop();                                          //  Pop item from traversal queue
       for (C_iter CC=C0+C->CHILD; CC!=C0+C->CHILD+C->NCHILD; CC++) {// Loop over child cells
         addSendCell(CC, iparent, icell);                        //   Add cells to send
-        if (CC->NCHILD == 0) {                                  //   If cell is twig
+        if (CC->NCHILD == 0) {                                  //   If cell is leaf
           addSendBody(CC, ibody, icell);                        //    Add bodies to send
-        } else {                                                //   If cell is not twig
+        } else {                                                //   If cell is not leaf
           bool divide = false;                                  //    Initialize logical for dividing
           vec3 Xperiodic = 0;                                   //    Periodic coordinate offset
           if (images == 0) {                                    //    If free boundary condition
           if (!divide) {                                        //    If cell does not have to be divided
             CC->NCHILD = 0;                                     //     Cut off child links
           }                                                     //    Endif for cell division
-        }                                                       //   Endif for twig
+        }                                                       //   Endif for leaf
        cellQueue.push(CC);                                     //   Push cell to traveral queue
       }                                                         //  End loop over child cells
       iparent++;                                                //  Increment parent send cell's offset
     if (remainder > iSplit) end++;                              // Adjust the end counter for remainder
   }
 
-  //! Reduce values
-  template<typename T>
-  T allreduce(T send) {
-    int word = sizeof(send) / 4;                                // Word size of value
-    T recv;                                                     // Receive buffer
-    MPI_Allreduce(&send, &recv, word, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);// Communicate values
-    return recv;                                                // Return received values
-  }
-
 //! Print a scalar value on all ranks
   template<typename T>
   void print(T data) {

include/partition.h

     return recvBodies;                                          // Return bodies
   }
 
+//! Allreduce int from all ranks
+  int allreduceInt(int send) {
+    int recv;                                                   // Receive buffer
+    MPI_Allreduce(&send, &recv, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);// Communicate values
+    return recv;                                                // Return received values
+  }
+
+//! Allreduce fvec3 from all ranks
+  fvec3 allreduceVec3(fvec3 send) {
+    fvec3 recv;                                                 // Receive buffer
+    MPI_Allreduce(send, recv, 3, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);// Communicate values
+    return recv;                                                // Return received values
+  }
+
 //! Allreduce bounds from all ranks
   Bounds allreduceBounds(Bounds local) {
     fvec3 localXmin, localXmax, globalXmin, globalXmax;

include/updownpass.h

       setRcrit(C0, C0, c);                                      //  Error optimization of Rcrit
       if( cells.size() > 9 ) {                                  //  If tree has more than 2 levels
         for (C_iter C=C0; C!=C0+9; C++) {                       //   Loop over top 2 levels of cells
-          C->RCRIT *= 10;                                       //    Prevent approximation
+          C->RCRIT = 1e12;                                      //    Prevent approximation
         }                                                       //   End loop over top 2 levels of cells
       }                                                         //  End if for tree levels
     }                                                           // End if for empty cell vector

wrappers/Makefile

 	make libcharmm.a
 	make libewald.a
 	$(FC) test_charmm.f90 -L. -lfmm -ltbb -lewald -lstdc++ -lmpi_cxx
-	./a.out
+	mpirun -np 2 ./a.out
 
 libgromacs.a: gromacs.o ../kernels/Laplace$(BASIS)$(DEVICE).o ../kernels/LaplaceP2P$(DEVICE).o
 	ar ruv libgromacs.a $?

wrappers/charmm.cxx

 extern "C" void fmm_(int * n, double * x, double * q, double * p, double * f, double * cycle) {
   args->numBodies = *n;
   logger->printTitle("FMM Parameters");
-  args->print(logger->stringLength,P);
+  args->print(logger->stringLength, P, LET->mpirank);
 #if _OPENMP
 #pragma omp parallel
 #pragma omp master
   }
   pass->downwardPass(cells);
   vec3 localDipole = pass->getDipole(bodies,0);
-  vec3 globalDipole = LET->allreduce(localDipole);
-  int numBodies = LET->allreduce(bodies.size());
+  vec3 globalDipole = LET->allreduceVec3(localDipole);
+  int numBodies = LET->allreduceInt(bodies.size());
   pass->dipoleCorrection(bodies, globalDipole, numBodies, *cycle);
 
   /*

wrappers/gromacs.cxx

     LET.verbose = true;
   }
   logger.printTitle("FMM Parameters");
-  args.print(logger.stringLength,P);
+  args.print(logger.stringLength, P, LET.mpirank);
 #if AUTO
   traversal.timeKernels();
 #endif
   }
   pass.downwardPass(cells);
   vec3 localDipole = pass.getDipole(bodies,0);
-  vec3 globalDipole = LET.allreduce(localDipole);
-  int numBodies = LET.allreduce(bodies.size());
+  vec3 globalDipole = LET.allreduceVec3(localDipole);
+  int numBodies = LET.allreduceInt(bodies.size());
   pass.dipoleCorrection(bodies,globalDipole,numBodies,cycle);
 
   LET.unpartition(bodies);

wrappers/test_charmm.f90

       isend = mod(mpirank + 1,           mpisize)
       irecv = mod(mpirank - 1 + mpisize, mpisize)
 
-      call mpi_isend(nold, 1, mpi_int, irecv, 0, mpi_comm_world, ireqs, ierr)
-      call mpi_irecv(nnew, 1, mpi_int, isend, 0, mpi_comm_world, ireqr, ierr)
+      call mpi_isend(nold, 1, mpi_integer, irecv, 0, mpi_comm_world, ireqs, ierr)
+      call mpi_irecv(nnew, 1, mpi_integer, isend, 0, mpi_comm_world, ireqr, ierr)
       call mpi_wait(ireqs, istatus, ierr)
       call mpi_wait(ireqr, istatus, ierr)
 
       type(c_ptr) ctx
       integer, dimension (128) :: iseed
       real(8), dimension (3) :: dipole = (/0, 0, 0/)
+      real(8), dimension (3) :: dipole2
       real(8), dimension (3) :: xperiodic
       allocatable :: x(:)
       allocatable :: q(:)
       call mpi_init(ierr)
       call mpi_comm_size(mpi_comm_world, mpisize, ierr);
       call mpi_comm_rank(mpi_comm_world, mpirank, ierr);
-      n = 1000;
+      ni = 2;
       images = 0
       ksize = 11
       pcycle = 2 * pi
       call random_number(x)
       call random_number(q)
       average = 0
-      do i = 1, n
+      do i = 1, ni
         x(3*i-2) = x(3*i-2) * pcycle - pcycle / 2
         x(3*i-1) = x(3*i-1) * pcycle - pcycle / 2
         x(3*i-0) = x(3*i-0) * pcycle - pcycle / 2
         f(3*i-0) = 0
         average = average + q(i)
       end do
-      average = average / n
+      average = average / ni
       do i = 1, n
         q(i) = q(i) - average
       end do
       call fmm_init(images)
-      call fmm_partition(n, x, q, pcycle)
-      call fmm(n, x, q, p, f, pcycle)
-      do i = 1, n
+      call fmm_partition(ni, x, q, pcycle)
+      call fmm(ni, x, q, p, f, pcycle)
+      do i = 1, ni
         x2(3*i-2) = x(3*i-2)
         x2(3*i-1) = x(3*i-1)
         x2(3*i-0) = x(3*i-0)
         f2(3*i-0) = 0
       end do
 #if 0
-      call fmm_ewald(n, x2, q2, p2, f2, ksize, alpha, pcycle)
+      call fmm_ewald(ni, x2, q2, p2, f2, ksize, alpha, pcycle)
 #else
       prange = 0
       do i = 0, images-1
         prange = prange + 3**i
       end do
-      do i = 1, N
+      do i = 1, ni
         do d = 1, 3
           dipole(d) = dipole(d) + x(3*i+d-3) * q(i)
         end do
       end do
-      norm = dipole(1) * dipole(1) + dipole(2) * dipole(2) + dipole(3) * dipole(3)
+      call mpi_allreduce(dipole, dipole2, 3, mpi_real8, mpi_sum, mpi_comm_world, ierr)
+      norm = dipole2(1) * dipole2(1) + dipole2(2) * dipole2(2) + dipole2(3) * dipole2(3)
       coef = 4 * pi / (3 * pcycle * pcycle * pcycle)
+      nj = ni
       if(mpirank.eq.0) print"(a)",'--- MPI direct sum ---------------'
       do irank = 0, mpisize-1
         if (mpirank.eq.0) print"(a,i1,a,i1)",'Direct loop          : ',irank+1,'/',mpisize
-        call mpi_shift(x2, 3*n, mpisize, mpirank)
-        call mpi_shift(q2, n,   mpisize, mpirank)
-        do i = 1, n
+        call mpi_shift(x2, 3*nj, mpisize, mpirank)
+        call mpi_shift(q2, nj,   mpisize, mpirank)
+        do i = 1, ni
           pp = 0
           fx = 0
           fy = 0
                 xperiodic(1) = ix * pcycle
                 xperiodic(2) = iy * pcycle
                 xperiodic(3) = iz * pcycle
-                do j = 1, n
+                do j = 1, nj
                   dx = x(3*i-2) - x2(3*j-2) - xperiodic(1)
                   dy = x(3*i-1) - x2(3*j-1) - xperiodic(2)
                   dz = x(3*i-0) - x2(3*j-0) - xperiodic(3)
               end do
             end do
           end do
-          p2(i) = p2(i) + pp - coef * norm / n / q2(i)
-          f2(3*i-2) = f2(3*i-2) - fx - coef * dipole(1)
-          f2(3*i-1) = f2(3*i-1) - fy - coef * dipole(2)
-          f2(3*i-0) = f2(3*i-0) - fz - coef * dipole(3)
+          p2(i) = p2(i) + pp - coef * norm / n / q(i)
+          f2(3*i-2) = f2(3*i-2) - fx - coef * dipole2(1)
+          f2(3*i-1) = f2(3*i-1) - fy - coef * dipole2(2)
+          f2(3*i-0) = f2(3*i-0) - fz - coef * dipole2(3)
         end do
       end do
 #endif
       norm4 = 0
       v = 0
       v2 = 0
-      do i = 1, n
+      do i = 1, ni
         v = v + p(i) * q(i)
-        v2 = v2 + p2(i) * q2(i)
+        v2 = v2 + p2(i) * q(i)
         diff2 = diff2 + (f(3*i-2) - f2(3*i-2)) * (f(3*i-2) - f2(3*i-2))&
                       + (f(3*i-1) - f2(3*i-1)) * (f(3*i-1) - f2(3*i-1))&
                       + (f(3*i-0) - f2(3*i-0)) * (f(3*i-0) - f2(3*i-0))
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.