Commits

Rio Yokota committed 1bc65d2

Forgot to add these.

Comments (0)

Files changed (12)

include/dataset.h

+/*
+Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef dataset_h
+#define dataset_h
+#include "kernel.h"
+
+//! Contains all the different datasets
+template<Equation equation>
+class Dataset : public Kernel<equation> {
+private:
+  long filePosition;                                            //!< Position of file stream
+
+public:
+  using Kernel<equation>::stringLength;                         //!< Max length of event name
+
+//! Constructor
+  Dataset() : filePosition(0) {}
+//! Destructor
+  ~Dataset() {}
+
+//! Initialize source values
+  void initSource(Bodies &bodies) {
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      B->IBODY = B-bodies.begin();                              //  Tag body with initial index
+      B->IPROC = MPIRANK;                                       //  Tag body with initial MPI rank
+//      B->SRC = (drand48() - .5) / bodies.size() / MPISIZE;      //  Initialize mass/charge
+      B->SRC = 1. / bodies.size() / MPISIZE;                    //  Initialize mass/charge
+    }                                                           // End loop over bodies
+  }
+
+//! Initialize target values
+  void initTarget(Bodies &bodies, bool IeqJ=true) {
+    srand48(0);                                                 // Set seed for random number generator
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      B->IBODY = B-bodies.begin();                              //  Tag body with initial index
+      B->IPROC = MPIRANK;                                       //  Tag body with initial MPI rank
+      B->TRG = 0;                                               //  Clear previous target values (IeqJ is dummy)
+      if( EPS2 != 0 ) B->TRG[0] = -B->SRC / std::sqrt(EPS2) * IeqJ;//  Initialize potential (0 if I != J)
+    }                                                           // End loop over bodies
+  }
+
+//! Read target values from file
+  void readTarget(Bodies &bodies) {
+    char fname[256];                                            // File name for saving direct calculation values
+    sprintf(fname,"direct%4.4d",MPIRANK);                       // Set file name
+    std::ifstream file(fname,std::ios::in | std::ios::binary);  // Open file
+    file.seekg(filePosition);                                   // Set position in file
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      file >> B->TRG[0];                                        //  Read data for potential
+      file >> B->TRG[1];                                        //  Read data for x acceleration
+      file >> B->TRG[2];                                        //  Read data for y acceleration
+      file >> B->TRG[3];                                        //  Read data for z acceleration
+    }                                                           // End loop over bodies
+    filePosition = file.tellg();                                // Get position in file
+    file.close();                                               // Close file
+  }
+
+//! Write target values to file
+  void writeTarget(Bodies &bodies) {
+    char fname[256];                                            // File name for saving direct calculation values
+    sprintf(fname,"direct%4.4d",MPIRANK);                       // Set file name
+    std::ofstream file(fname,std::ios::out | std::ios::app | std::ios::binary);// Open file
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      file << B->TRG[0] << std::endl;                           //  Write data for potential
+      file << B->TRG[1] << std::endl;                           //  Write data for x acceleration
+      file << B->TRG[2] << std::endl;                           //  Write data for y acceleration
+      file << B->TRG[3] << std::endl;                           //  Write data for z acceleration
+    }                                                           // End loop over bodies
+    file.close();                                               // Close file
+  }
+
+//! Evaluate relative L2 norm error
+  void evalError(Bodies &bodies, Bodies &bodies2,
+                 real &diff1, real &norm1, real &diff2, real &norm2, bool ewald=false) {
+    real p = 0, p2 = 0;                                         // Total energy
+    B_iter B2 = bodies2.begin();                                // Set iterator for bodies2
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B, ++B2 ) {// Loop over bodies & bodies2
+#ifdef DEBUG
+      std::cout << B->ICELL << " " << B->TRG[0] << " " << B2->TRG[0] << std::endl;// Compare every element
+#endif
+      if( ewald ) {                                             // If ewald method is used
+        p += B->TRG[0] * B->SRC;                                //  total energy
+        p2 += B2->TRG[0] * B2->SRC;                             //  total energy
+      } else {                                                  // If normal Laplace kernel is used
+        diff1 += (B->TRG[0] - B2->TRG[0]) * (B->TRG[0] - B2->TRG[0]);// Difference of potential
+        norm1 += B2->TRG[0] * B2->TRG[0];                       //  Value of potential
+      }                                                         // End if for Ewald method
+      diff2 += (B->TRG[1] - B2->TRG[1]) * (B->TRG[1] - B2->TRG[1]);// Difference of x acceleration
+      diff2 += (B->TRG[2] - B2->TRG[2]) * (B->TRG[2] - B2->TRG[2]);// Difference of y acceleration
+      diff2 += (B->TRG[3] - B2->TRG[3]) * (B->TRG[3] - B2->TRG[3]);// Difference of z acceleration
+      norm2 += B2->TRG[1] * B2->TRG[1];                         //  Value of x acceleration
+      norm2 += B2->TRG[2] * B2->TRG[2];                         //  Value of y acceleration
+      norm2 += B2->TRG[3] * B2->TRG[3];                         //  Value of z acceleration
+    }                                                           //  End loop over bodies & bodies2
+    if( ewald ) {                                               // If ewald method is used
+      diff1 = (p - p2) * (p - p2);                              //  Difference of total energy
+      norm1 = p2 * p2;                                          //  Value of total energy
+    }                                                           // End if for Ewald method
+  }
+
+//! Print relative L2 norm error
+  void printError(real diff1, real norm1, real diff2, real norm2) {
+    std::cout << std::setw(stringLength) << std::left
+              << "Error (pot)" << " : " << std::sqrt(diff1/norm1) << std::endl;
+    std::cout << std::setw(stringLength) << std::left
+              << "Error (acc)" << " : " << std::sqrt(diff2/norm2) << std::endl;
+  }
+};
+
+template<>
+class Dataset<VanDerWaals> : public Kernel<VanDerWaals> {
+private:
+  long filePosition;                                            //!< Position of file stream
+
+public:
+//! Constructor
+  Dataset() : filePosition(0) {}
+//! Destructor
+  ~Dataset() {}
+
+//! Initialize source values
+  void initSource(Bodies &bodies) {
+    THETA = .1;                                                 // Force opening angle to be small
+    ATOMS = 16;                                                 // Set number of atoms
+    RSCALE.resize(ATOMS*ATOMS);                                 // Resize rscale vector
+    GSCALE.resize(ATOMS*ATOMS);                                 // Resize gscale vector
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      B->IBODY = B-bodies.begin();                              //  Tag body with initial index
+      B->IPROC = MPIRANK;                                       //  Tag body with initial MPI rank
+      B->SRC = drand48() * ATOMS;                               //  Initialize mass/charge
+    }                                                           // End loop over bodies
+    for( int i=0; i!=ATOMS; ++i ) {                             // Loop over atoms
+      GSCALE[i*ATOMS+i] = drand48();                            //  Set VanDerWaals post scaling factor
+      RSCALE[i*ATOMS+i] = drand48();                            //  Set VanDerWaals pre scaling factor
+    }                                                           // End loop over atoms
+    for( int i=0; i!=ATOMS; ++i ) {                             // Loop over target atoms
+      for( int j=0; j!=ATOMS; ++j ) {                           //  Loop over source atoms
+        if( i != j ) {                                          //   If target and source are different
+          GSCALE[i*ATOMS+j] = std::sqrt(GSCALE[i*ATOMS+i] * GSCALE[j*ATOMS+j]);// Set post scaling factor
+          RSCALE[i*ATOMS+j] = (std::sqrt(RSCALE[i*ATOMS+i]) + std::sqrt(RSCALE[j*ATOMS+j])) * 0.5;
+          RSCALE[i*ATOMS+j] *= RSCALE[i*ATOMS+j];               //    Set pre scaling factor
+        }                                                       //   End if for different target and source
+      }                                                         //  End loop over source atoms
+    }                                                           // End loop over target atoms
+  }
+
+//! Initialize target values
+  void initTarget(Bodies &bodies, bool IeqJ=true) {
+    srand48(0);                                                 // Set seed for random number generator
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      B->IBODY = B-bodies.begin();                              //  Tag body with initial index
+      B->IPROC = MPIRANK;                                       //  Tag body with initial MPI rank
+      B->TRG = 0 * IeqJ;                                        //  Clear previous target values
+    }                                                           // End loop over bodies
+  }
+
+//! Read target values from file
+  void readTarget(Bodies &bodies) {
+    char fname[256];                                            // File name for saving direct calculation values
+    sprintf(fname,"direct%4.4d",MPIRANK);                       // Set file name
+    std::ifstream file(fname,std::ios::in | std::ios::binary);  // Open file
+    file.seekg(filePosition);                                   // Set position in file
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      file >> B->TRG[0];                                        //  Read data for potential
+      file >> B->TRG[1];                                        //  Read data for x acceleration
+      file >> B->TRG[2];                                        //  Read data for y acceleration
+      file >> B->TRG[3];                                        //  Read data for z acceleration
+    }                                                           // End loop over bodies
+    filePosition = file.tellg();                                // Get position in file
+    file.close();                                               // Close file
+  }
+
+//! Write target values to file
+  void writeTarget(Bodies &bodies) {
+    char fname[256];                                            // File name for saving direct calculation values
+    sprintf(fname,"direct%4.4d",MPIRANK);                       // Set file name
+    std::ofstream file(fname,std::ios::out | std::ios::app | std::ios::binary);// Open file
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      file << B->TRG[0] << std::endl;                           //  Write data for potential
+      file << B->TRG[1] << std::endl;                           //  Write data for x acceleration
+      file << B->TRG[2] << std::endl;                           //  Write data for y acceleration
+      file << B->TRG[3] << std::endl;                           //  Write data for z acceleration
+    }                                                           // End loop over bodies
+    file.close();                                               // Close file
+  }
+
+//! Evaluate relative L2 norm error
+  void evalError(Bodies &bodies, Bodies &bodies2,
+                 real &diff1, real &norm1, real &diff2, real &norm2) {
+    B_iter B2 = bodies2.begin();                                // Set iterator for bodies2
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B, ++B2 ) {// Loop over bodies & bodies2
+#ifdef DEBUG
+      std::cout << B->ICELL << " " << B->TRG[0] << " " << B2->TRG[0] << std::endl;// Compare every element
+#endif
+      diff1 += (B->TRG[0] - B2->TRG[0]) * (B->TRG[0] - B2->TRG[0]);// Difference of potential
+      norm1 += B2->TRG[0] * B2->TRG[0];                         //  Value of potential
+      diff2 += (B->TRG[1] - B2->TRG[1]) * (B->TRG[1] - B2->TRG[1]);// Difference of x acceleration
+      diff2 += (B->TRG[2] - B2->TRG[2]) * (B->TRG[2] - B2->TRG[2]);// Difference of y acceleration
+      diff2 += (B->TRG[3] - B2->TRG[3]) * (B->TRG[3] - B2->TRG[3]);// Difference of z acceleration
+      norm2 += B2->TRG[1] * B2->TRG[1];                         //  Value of x acceleration
+      norm2 += B2->TRG[2] * B2->TRG[2];                         //  Value of y acceleration
+      norm2 += B2->TRG[3] * B2->TRG[3];                         //  Value of z acceleration
+    }                                                           // End loop over bodies & bodies2
+  }
+
+//! Print relative L2 norm error
+  void printError(real diff1, real norm1, real diff2, real norm2) {
+    std::cout << std::setw(stringLength) << std::left
+              << "Error (pot)" << " : " << std::sqrt(diff1/norm1) << std::endl;
+    std::cout << std::setw(stringLength) << std::left
+              << "Error (acc)" << " : " << std::sqrt(diff2/norm2) << std::endl;
+  }
+};
+
+#endif

include/evaluator.h

+/*
+Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef evaluator_h
+#define evaluator_h
+#include "dataset.h"
+#define splitFirst(Ci,Cj) Cj->NCHILD == 0 || (Ci->NCHILD != 0 && Ci->R > Cj->R)
+
+//! Interface between tree and kernel
+template<Equation equation>
+class Evaluator : public Dataset<equation> {
+private:
+  real        timeM2L;                                          //!< M2L execution time
+  real        timeM2P;                                          //!< M2P execution time
+  real        timeP2P;                                          //!< P2P execution time
+
+protected:
+  C_iter      CiB;                                              //!< icells begin per call
+  C_iter      CiE;                                              //!< icells end per call
+  Lists       listM2L;                                          //!< M2L interaction list
+  Lists       listM2P;                                          //!< M2P interaction list
+  Lists       listP2P;                                          //!< P2P interaction list
+
+  int         Iperiodic;                                        //!< Periodic image flag (using each bit for images)
+  const int   Icenter;                                          //!< Periodic image flag at center
+  Maps        flagM2L;                                          //!< Existance of periodic image for M2L
+  Maps        flagM2P;                                          //!< Existance of periodic image for M2P
+  Maps        flagP2P;                                          //!< Existance of periodic image for P2P
+
+  real        NP2P;                                             //!< Number of P2P kernel calls
+  real        NM2P;                                             //!< Number of M2P kernel calls
+  real        NM2L;                                             //!< Number of M2L kernel calls
+
+public:
+  using Kernel<equation>::startTimer;                           //!< Start timer for given event
+  using Kernel<equation>::stopTimer;                            //!< Stop timer for given event
+  using Kernel<equation>::writeTrace;                           //!< Write traces of all events
+  using Kernel<equation>::R0;                                   //!< Radius of root cell
+  using Kernel<equation>::Ci0;                                  //!< Begin iterator for target cells
+  using Kernel<equation>::Cj0;                                  //!< Begin iterator for source cells
+  using Kernel<equation>::ALPHA;                                //!< Scaling parameter for Ewald summation
+  using Kernel<equation>::keysHost;                             //!< Offsets for rangeHost
+  using Kernel<equation>::rangeHost;                            //!< Offsets for sourceHost
+  using Kernel<equation>::constHost;                            //!< Constants on host
+  using Kernel<equation>::sourceHost;                           //!< Sources on host
+  using Kernel<equation>::targetHost;                           //!< Targets on host
+  using Kernel<equation>::sourceBegin;                          //!< Define map for offset of source cells
+  using Kernel<equation>::sourceSize;                           //!< Define map for size of source cells
+  using Kernel<equation>::targetBegin;                          //!< Define map for offset of target cells
+  using Kernel<equation>::allocate;                             //!< Allocate GPU kernels
+  using Kernel<equation>::hostToDevice;                         //!< Copy from host to device
+  using Kernel<equation>::deviceToHost;                         //!< Copy from device to host
+  using Kernel<equation>::P2M;                                  //!< Evaluate P2M kernel
+  using Kernel<equation>::M2M;                                  //!< Evaluate M2M kernel
+  using Kernel<equation>::M2L;                                  //!< Evaluate M2L kernel
+  using Kernel<equation>::M2P;                                  //!< Evaluate M2P kernel
+  using Kernel<equation>::P2P;                                  //!< Evaluate P2P kernel
+  using Kernel<equation>::L2L;                                  //!< Evaluate L2L kernel
+  using Kernel<equation>::L2P;                                  //!< Evaluate L2P kernel
+  using Kernel<equation>::EwaldReal;                            //!< Evaluate Ewald real part
+  using Dataset<equation>::initSource;                          //!< Initialize source values
+  using Dataset<equation>::initTarget;                          //!< Initialize target values
+
+private:
+//! Approximate interaction between two cells
+  inline void approximate(C_iter Ci, C_iter Cj) {
+#if HYBRID
+    if( timeP2P*Cj->NDLEAF < timeM2P && timeP2P*Ci->NDLEAF*Cj->NDLEAF < timeM2L) {// If P2P is fastest
+      evalP2P(Ci,Cj);                                           //  Evaluate on CPU, queue on GPU
+    } else if ( timeM2P < timeP2P*Cj->NDLEAF && timeM2P*Ci->NDLEAF < timeM2L ) {// If M2P is fastest
+      evalM2P(Ci,Cj);                                           //  Evaluate on CPU, queue on GPU
+    } else {                                                    // If M2L is fastest
+      evalM2L(Ci,Cj);                                           //  Evaluate on CPU, queue on GPU
+    }                                                           // End if for kernel selection
+#elif TREECODE
+    evalM2P(Ci,Cj);                                             // Evaluate on CPU, queue on GPU
+#else
+    evalM2L(Ci,Cj);                                             // Evalaute on CPU, queue on GPU
+#endif
+  }
+
+#if QUARK
+  inline void interact(C_iter Ci, C_iter Cj, Quark *quark);     //!< interact() function using QUARK
+#endif
+
+//! Traverse a single tree using a stack
+  void traverseStack(C_iter Ci, C_iter C) {
+    CellStack cellStack;                                        // Stack of cells
+    cellStack.push(C);                                          // Push pair to queue
+    while( !cellStack.empty() ) {                               // While traversal stack is not empty
+      C = cellStack.top();                                      //  Get cell from top of stack
+      cellStack.pop();                                          //  Pop traversal stack
+      for( C_iter Cj=Cj0+C->CHILD; Cj!=Cj0+C->CHILD+C->NCHILD; ++Cj ) {// Loop over cell's children
+        vect dX = Ci->X - Cj->X - Xperiodic;                    //   Distance vector from source to target
+        real Rq = std::sqrt(norm(dX));                          //   Scalar distance
+        if( Rq * THETA < Ci->R + Cj->R && Cj->NCHILD == 0 ) {   //   If twigs are close
+          evalEwaldReal(Ci,Cj);                                 //    Ewald real part
+        } else if( Cj->NCHILD != 0 ) {                          //   If cells are not twigs
+          cellStack.push(Cj);                                   //    Push source cell to stack
+        }                                                       //   End if for twig cells
+      }                                                         //  End loop over cell's children
+    }                                                           // End while loop for traversal stack
+  }
+
+//! Traverse a pair of trees using a queue
+  void traverseQueue(Pair pair) {
+    PairQueue pairQueue;                                        // Queue of interacting cell pairs
+#if QUARK
+    Quark *quark = QUARK_New(4);                                // Initialize QUARK object
+    C_iter root = pair.first;                                   // Iterator for root target cell
+#endif
+    pairQueue.push_back(pair);                                  // Push pair to queue
+    while( !pairQueue.empty() ) {                               // While dual traversal queue is not empty
+      pair = pairQueue.front();                                 //  Get interaction pair from front of queue
+      pairQueue.pop_front();                                    //  Pop dual traversal queue
+      if(splitFirst(pair.first,pair.second)) {                  //  If first cell is larger
+        C_iter C = pair.first;                                  //   Split the first cell
+        for( C_iter Ci=Ci0+C->CHILD; Ci!=Ci0+C->CHILD+C->NCHILD; ++Ci ) {// Loop over first cell's children
+          interact(Ci,pair.second,pairQueue);                   //    Calculate interaction between cells
+        }                                                       //   End loop over fist cell's children
+      } else {                                                  //  Else if second cell is larger
+        C_iter C = pair.second;                                 //   Split the second cell
+        for( C_iter Cj=Cj0+C->CHILD; Cj!=Cj0+C->CHILD+C->NCHILD; ++Cj ) {// Loop over second cell's children
+          interact(pair.first,Cj,pairQueue);                    //    Calculate interaction betwen cells
+        }                                                       //   End loop over second cell's children
+      }                                                         //  End if for which cell to split
+#if QUARK
+      if( int(pairQueue.size()) > root->NDLEAF / 100 ) {        //  When queue size reaches threshold
+        while( !pairQueue.empty() ) {                           //   While dual traversal queue is not empty
+          pair = pairQueue.front();                             //    Get interaction pair from front of queue
+          pairQueue.pop_front();                                //    Pop dual traversal queue
+          interact(pair.first,pair.second,quark);               //    Schedule interact() task on QUARK
+        }                                                       //   End while loop for dual traversal queue
+      }                                                         //  End if for queue size
+#endif
+    }                                                           // End while loop for dual traversal queue
+#if QUARK
+    QUARK_Delete(quark);                                        // Delete QUARK object 
+    writeTrace();                                               // Write event trace to file
+#endif
+  }
+
+//! Get range of periodic images
+  int getPeriodicRange() {
+    int prange = 0;                                             //  Range of periodic images
+    for( int i=0; i!=IMAGES; ++i ) {                            //  Loop over periodic image sublevels
+      prange += int(pow(3,i));                                  //   Accumulate range of periodic images
+    }                                                           //  End loop over perioidc image sublevels
+    return prange;                                              // Return range of periodic images
+  }
+
+protected:
+//! Get level from cell index
+  int getLevel(bigint index) {
+    int i = index;                                              // Copy to dummy index
+    int level = -1;                                             // Initialize level counter
+    while( i >= 0 ) {                                           // While cell index is non-negative
+      level++;                                                  //  Increment level
+      i -= 1 << 3*level;                                        //  Subtract number of cells in that level
+    }                                                           // End while loop for cell index
+    return level;                                               // Return the level
+  }
+
+  void timeKernels();                                           //!< Time all kernels for auto-tuning
+
+//! Upward phase for periodic cells
+  void upwardPeriodic(Cells &jcells) {
+    Cells pccells, pjcells;                                     // Periodic center cell and jcell
+    pccells.push_back(jcells.back());                           // Root cell is first periodic cell
+    for( int level=0; level<IMAGES-1; ++level ) {               // Loop over sublevels of tree
+      Cell cell;                                                //  New periodic cell at next sublevel
+      C_iter C = pccells.end() - 1;                             //  Set previous periodic center cell as source
+      for( int ix=-1; ix<=1; ++ix ) {                           //  Loop over x periodic direction
+        for( int iy=-1; iy<=1; ++iy ) {                         //   Loop over y periodic direction
+          for( int iz=-1; iz<=1; ++iz ) {                       //    Loop over z periodic direction
+            if( ix != 0 || iy != 0 || iz != 0 ) {               //     If periodic cell is not at center
+              for( int cx=-1; cx<=1; ++cx ) {                   //      Loop over x periodic direction (child)
+                for( int cy=-1; cy<=1; ++cy ) {                 //       Loop over y periodic direction (child)
+                  for( int cz=-1; cz<=1; ++cz ) {               //        Loop over z periodic direction (child)
+                    cell.X[0]  = C->X[0] + (ix * 6 + cx * 2) * C->R;//     Set new x coordinate for periodic image
+                    cell.X[1]  = C->X[1] + (iy * 6 + cy * 2) * C->R;//     Set new y cooridnate for periodic image
+                    cell.X[2]  = C->X[2] + (iz * 6 + cz * 2) * C->R;//     Set new z coordinate for periodic image
+                    cell.M     = C->M;                          //         Copy multipoles to new periodic image
+                    cell.NCLEAF = cell.NDLEAF = cell.NCHILD = 0;//         Initialize NCLEAF, NDLEAF, & NCHILD
+                    jcells.push_back(cell);                     //         Push cell into periodic jcell vector
+                  }                                             //        End loop over z periodic direction (child)
+                }                                               //       End loop over y periodic direction (child)
+              }                                                 //      End loop over x periodic direction (child)
+            }                                                   //     Endif for periodic center cell
+          }                                                     //    End loop over z periodic direction
+        }                                                       //   End loop over y periodic direction
+      }                                                         //  End loop over x periodic direction
+      for( int ix=-1; ix<=1; ++ix ) {                           //  Loop over x periodic direction
+        for( int iy=-1; iy<=1; ++iy ) {                         //   Loop over y periodic direction
+          for( int iz=-1; iz<=1; ++iz ) {                       //    Loop over z periodic direction
+            cell.X[0] = C->X[0] + ix * 2 * C->R;                //     Set new x coordinate for periodic image
+            cell.X[1] = C->X[1] + iy * 2 * C->R;                //     Set new y cooridnate for periodic image
+            cell.X[2] = C->X[2] + iz * 2 * C->R;                //     Set new z coordinate for periodic image
+            cell.M = C->M;                                      //     Copy multipoles to new periodic image
+            pjcells.push_back(cell);                            //     Push cell into periodic jcell vector
+          }                                                     //    End loop over z periodic direction
+        }                                                       //   End loop over y periodic direction
+      }                                                         //  End loop over x periodic direction
+      cell.X = C->X;                                            //  This is the center cell
+      cell.R = 3 * C->R;                                        //  The cell size increases three times
+      pccells.pop_back();                                       //  Pop periodic center cell from vector
+      pccells.push_back(cell);                                  //  Push cell into periodic cell vector
+      C_iter Ci = pccells.end() - 1;                            //  Set current cell as target for M2M
+      Ci->CHILD = 0;                                            //  Set child cells for periodic M2M
+      Ci->NCHILD = 27;                                          //  Set number of child cells for periodic M2M
+      evalM2M(pccells,pjcells);                                 // Evaluate periodic M2M kernels for this sublevel
+      pjcells.clear();                                          // Clear periodic jcell vector
+    }                                                           // End loop over sublevels of tree
+  }
+
+//! Traverse tree for periodic cells
+  void traversePeriodic(Cells &cells, Cells &jcells) {          
+    Xperiodic = 0;                                              // Set periodic coordinate offset
+    Iperiodic = Icenter;                                        // Set periodic flag to center
+    C_iter Cj = jcells.end()-1;                                 // Initialize iterator for periodic source cell
+    for( int level=0; level<IMAGES-1; ++level ) {               // Loop over sublevels of tree
+      for( int I=0; I!=26*27; ++I, --Cj ) {                     //  Loop over periodic images (exclude center)
+#if TREECODE
+        for( C_iter Ci=cells.begin(); Ci!=cells.end(); ++Ci ) { //   Loop over cells
+          if( Ci->NCHILD == 0 ) {                               //    If cell is twig
+            evalM2P(Ci,Cj);                                     //     Perform M2P kernel
+          }                                                     //    Endif for twig
+        }                                                       //   End loop over cells
+#else
+        C_iter Ci = cells.end() - 1;                            //   Set root cell as target iterator
+        evalM2L(Ci,Cj);                                         //   Perform M2P kernel
+#endif
+      }                                                         //  End loop over x periodic direction
+    }                                                           // End loop over sublevels of tree
+  }
+
+public:
+//! Constructor
+  Evaluator() : Icenter(1 << 13), NP2P(0), NM2P(0), NM2L(0) {}
+//! Destructor
+  ~Evaluator() {}
+
+//! Random distribution in [-1,1]^3 cube
+  void cube(Bodies &bodies, int seed=0, int numSplit=1) {
+    srand48(seed);                                              // Set seed for random number generator
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      if( numSplit != 1 && B-bodies.begin() == int(seed*bodies.size()/numSplit) ) {// Mimic parallel dataset
+        seed++;                                                 //   Mimic seed at next rank
+        srand48(seed);                                          //   Set seed for random number generator
+      }                                                         //  Endif for mimicing parallel dataset
+      for( int d=0; d!=3; ++d ) {                               //  Loop over dimension
+        B->X[d] = drand48() * 2 * M_PI - M_PI;                  //   Initialize positions
+      }                                                         //  End loop over dimension
+    }                                                           // End loop over bodies
+    initSource(bodies);                                         // Initialize source values
+    initTarget(bodies);                                         // Initialize target values
+  }
+
+//! Random distribution on r = 1 sphere
+  void sphere(Bodies &bodies, int seed=0, int numSplit=1) {
+    srand48(seed);                                              // Set seed for random number generator
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      if( numSplit != 1 && B-bodies.begin() == int(seed*bodies.size()/numSplit) ) {// Mimic parallel dataset
+        seed++;                                                 //   Mimic seed at next rank
+        srand48(seed);                                          //   Set seed for random number generator
+      }                                                         //  Endif for mimicing parallel dataset
+      for( int d=0; d!=3; ++d ) {                               //  Loop over dimension
+        B->X[d] = drand48() * 2 - 1;                            //   Initialize positions
+      }                                                         //  End loop over dimension
+      real r = std::sqrt(norm(B->X));                           //  Distance from center
+      for( int d=0; d!=3; ++d ) {                               //  Loop over dimension
+        B->X[d] /= r * 1.1;                                     //   Normalize positions
+      }                                                         //  End loop over dimension
+    }                                                           // End loop over bodies
+    initSource(bodies);                                         // Initialize source values
+    initTarget(bodies);                                         // Initialize target values
+  }
+
+//! Uniform distribution on [-pi,pi]^3 lattice (for debugging)
+  void lattice(Bodies &bodies) {
+    int nx = int(powf(bodies.size()*MPISIZE,1./3));             // Size of lattice per dimension
+    assert( int(bodies.size()*MPISIZE) == nx*nx*nx );           // Number of bodies must be cube of integer
+    int sign = 1;                                               // Initialize alternating sign
+    B_iter B=bodies.begin();                                    // Initialize body iterator
+    for( int ix=0; ix!=nx; ++ix ) {                             // Loop over x direction
+      for( int iy=0; iy!=nx; ++iy ) {                           //  Loop over y direction
+        for( int iz=0; iz!=nx; ++iz, ++B ) {                    //   Loop over z direction
+          B->X[0] = -M_PI + (2 * M_PI * ix + M_PI) / nx;        //    Set x coordinate
+          B->X[1] = -M_PI + (2 * M_PI * iy + M_PI) / nx;        //    Set y coordinate
+          B->X[2] = -M_PI + (2 * M_PI * iz + M_PI) / nx;        //    Set z coordinate
+          B->SRC = sign;                                        //    Source alternates between -1 and 1
+          B->IBODY = B-bodies.begin();                          //    Tag body with initial index
+          B->IPROC = MPIRANK;                                   //    Tag body with initial MPI rank
+          sign = -sign;                                         //    Alternate sign in x direction
+        }                                                       //   End loop over z direction
+        sign = -sign;                                           //   Alternate sign in y direction
+      }                                                         //  End loop over y direction
+      sign = -sign;                                             //  Alternate sign in z direction
+    }                                                           // End loop over x direction
+    initTarget(bodies);                                         // Initialize target values
+  }
+
+//! Add single list for kernel unit test
+  void addM2L(C_iter Cj) {
+    listM2L.resize(1);                                          // Resize vector of M2L interation lists
+    flagM2L.resize(1);                                          // Resize vector of M2L periodic image flags
+    listM2L[0].push_back(Cj);                                   // Push single cell into list
+    flagM2L[0][Cj] |= Icenter;                                  // Flip bit of periodic image flag
+  }
+
+//! Add single list for kernel unit test
+  void addM2P(C_iter Cj) {
+    listM2P.resize(1);                                          // Resize vector of M2P interation lists
+    flagM2P.resize(1);                                          // Resize vector of M2L periodic image flags
+    listM2P[0].push_back(Cj);                                   // Push single cell into list
+    flagM2P[0][Cj] |= Icenter;                                  // Flip bit of periodic image flag
+  }
+
+//! Create periodic images of bodies
+  Bodies periodicBodies(Bodies &bodies) {
+    Bodies jbodies;                                             // Vector for periodic images of bodies
+    int prange = getPeriodicRange();                            // Get range of periodic images
+    for( int ix=-prange; ix<=prange; ++ix ) {                   // Loop over x periodic direction
+      for( int iy=-prange; iy<=prange; ++iy ) {                 //  Loop over y periodic direction
+        for( int iz=-prange; iz<=prange; ++iz ) {               //   Loop over z periodic direction
+          for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {//    Loop over bodies
+            Body body = *B;                                     //     Copy current body
+            body.X[0] += ix * 2 * R0;                           //     Shift x position
+            body.X[1] += iy * 2 * R0;                           //     Shift y position
+            body.X[2] += iz * 2 * R0;                           //     Shift z position
+            jbodies.push_back(body);                            //     Push shifted body into jbodies
+          }                                                     //    End loop over bodies
+        }                                                       //   End loop over z periodic direction
+      }                                                         //  End loop over y periodic direction
+    }                                                           // End loop over x periodic direction
+    return jbodies;                                             // Return vector for periodic images of bodies
+  }
+
+//! Use multipole acceptance criteria to determine whether to approximate, do P2P, or subdivide
+  void interact(C_iter Ci, C_iter Cj, PairQueue &pairQueue) {
+    vect dX = Ci->X - Cj->X - Xperiodic;                        // Distance vector from source to target
+    real Rq = std::sqrt(norm(dX));                              // Scalar distance
+    if( Rq * THETA > Ci->R + Cj->R ) {                          // If distance if far enough
+      approximate(Ci,Cj);                                       //  Use approximate kernels, e.g. M2L, M2P
+    } else if(Ci->NCHILD == 0 && Cj->NCHILD == 0) {             // Else if both cells are leafs
+      evalP2P(Ci,Cj);                                           //  Use P2P
+    } else {                                                    // If cells are close but not leafs
+      Pair pair(Ci,Cj);                                         //  Form a pair of cell iterators
+      pairQueue.push_back(pair);                                //  Push pair to queue
+    }                                                           // End if for multipole acceptance
+  }
+
+//! Dual tree traversal
+  void traverse(Cells &cells, Cells &jcells) {
+    C_iter root = cells.end() - 1;                              // Iterator for root target cell
+    C_iter jroot = jcells.end() - 1;                            // Iterator for root source cell
+    if( IMAGES != 0 ) {                                         // If periodic boundary condition
+      jroot = jcells.end() - 1 - 26 * 27 * (IMAGES - 1);        //  The root is not at the end
+    }                                                           // Endif for periodic boundary condition
+    Ci0 = cells.begin();                                        // Set begin iterator for target cells
+    Cj0 = jcells.begin();                                       // Set begin iterator for source cells
+#if QUEUE
+    listM2L.resize(cells.size());                               // Resize M2L interaction list
+    listM2P.resize(cells.size());                               // Resize M2P interaction list
+    listP2P.resize(cells.size());                               // Resize P2P interaction list
+    flagM2L.resize(cells.size());                               // Resize M2L periodic image flag
+    flagM2P.resize(cells.size());                               // Resize M2P periodic image flag
+    flagP2P.resize(cells.size());                               // Resize P2P periodic image flag
+#endif
+    if( IMAGES == 0 ) {                                         // If free boundary condition
+      Iperiodic = Icenter;                                      //  Set periodic image flag to center
+      Xperiodic = 0;                                            //  Set periodic coordinate offset
+      Pair pair(root,jroot);                                    //  Form pair of root cells
+      traverseQueue(pair);                                      //  Traverse a pair of trees
+    } else {                                                    // If periodic boundary condition
+      int I = 0;                                                //  Initialize index of periodic image
+      for( int ix=-1; ix<=1; ++ix ) {                           //  Loop over x periodic direction
+        for( int iy=-1; iy<=1; ++iy ) {                         //   Loop over y periodic direction
+          for( int iz=-1; iz<=1; ++iz, ++I ) {                  //    Loop over z periodic direction
+            Iperiodic = 1 << I;                                 //     Set periodic image flag
+            Xperiodic[0] = ix * 2 * R0;                         //     Coordinate offset for x periodic direction
+            Xperiodic[1] = iy * 2 * R0;                         //     Coordinate offset for y periodic direction
+            Xperiodic[2] = iz * 2 * R0;                         //     Coordinate offset for z periodic direction
+            Pair pair(root,jroot);                              //     Form pair of root cells
+            traverseQueue(pair);                                //     Traverse a pair of trees
+          }                                                     //    End loop over z periodic direction
+        }                                                       //   End loop over y periodic direction
+      }                                                         //  End loop over x periodic direction
+      for( C_iter Ci=cells.begin(); Ci!=cells.end(); ++Ci ) {   //  Loop over target cells
+        listM2L[Ci-Ci0].sort();                                 //  Sort interaction list
+        listM2L[Ci-Ci0].unique();                               //  Eliminate duplicate periodic entries
+        listM2P[Ci-Ci0].sort();                                 //  Sort interaction list
+        listM2P[Ci-Ci0].unique();                               //  Eliminate duplicate periodic entries
+        listP2P[Ci-Ci0].sort();                                 //  Sort interaction list
+        listP2P[Ci-Ci0].unique();                               //  Eliminate duplicate periodic entries
+      }                                                         //  End loop over target cells
+    }                                                           // Endif for periodic boundary condition
+  }
+
+//! Traverse neighbor cells only (for cutoff based methods)
+  void neighbor(Cells &cells, Cells &jcells) {
+    C_iter root = cells.end() - 1;                              // Iterator for root target cell
+    C_iter jroot = jcells.end() - 1;                            // Iterator for root source cell
+    Ci0 = cells.begin();                                        // Set begin iterator for target cells
+    Cj0 = jcells.begin();                                       // Set begin iterator for source cells
+    listP2P.resize(cells.size());                               // Resize P2P interaction list
+    flagP2P.resize(cells.size());                               // Resize P2P periodic image flag
+    for( C_iter Ci=cells.begin(); Ci!=cells.end(); ++Ci ) {     // Loop over target cells
+      if( Ci->NCHILD == 0 ) {                                   //  If cell is a twig
+        int I = 0;                                              //   Initialize index of periodic image
+        for( int ix=-1; ix<=1; ++ix ) {                         //   Loop over x periodic direction
+          for( int iy=-1; iy<=1; ++iy ) {                       //    Loop over y periodic direction
+            for( int iz=-1; iz<=1; ++iz, ++I ) {                //     Loop over z periodic direction
+              Iperiodic = 1 << I;                               //      Set periodic image flag
+              Xperiodic[0] = ix * 2 * R0;                       //      Coordinate offset for x periodic direction
+              Xperiodic[1] = iy * 2 * R0;                       //      Coordinate offset for y periodic direction
+              Xperiodic[2] = iz * 2 * R0;                       //      Coordinate offset for z periodic direction
+              traverseStack(Ci,jroot);                          //      Traverse the source tree
+            }                                                   //     End loop over z periodic direction
+          }                                                     //    End loop over y periodic direction
+        }                                                       //   End loop over x periodic direction
+        for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NCLEAF; ++B) {  //   Loop over all leafs in cell
+          B->TRG[0] -= M_2_SQRTPI * B->SRC * ALPHA;             //    Self term of Ewald real part
+        }                                                       //   End loop over all leafs in cell
+      }                                                         //  End if for twig cells
+      listP2P[Ci-Ci0].sort();                                   //  Sort interaction list
+      listP2P[Ci-Ci0].unique();                                 //  Eliminate duplicate periodic entries
+    }                                                           // End loop over target cells
+  }
+
+  void setSourceBody();                                         //!< Set source buffer for bodies (for GPU)
+  void setSourceCell(bool isM);                                 //!< Set source buffer for cells (for GPU)
+  void setTargetBody(Lists lists, Maps flags);                  //!< Set target buffer for bodies (for GPU)
+  void setTargetCell(Lists lists, Maps flags);                  //!< Set target buffer for cells (for GPU)
+  void getTargetBody(Lists &lists);                             //!< Get body values from target buffer (for GPU)
+  void getTargetCell(Lists &lists, bool isM);                   //!< Get cell values from target buffer (for GPU)
+  void clearBuffers();                                          //!< Clear GPU buffers
+
+  void evalP2P(Bodies &ibodies, Bodies &jbodies, bool onCPU=false);//!< Evaluate all P2P kernels (all pairs)
+  void evalP2M(Cells &cells);                                   //!< Evaluate all P2M kernels
+  void evalM2M(Cells &cells, Cells &jcells);                    //!< Evaluate all M2M kernels
+  void evalM2L(C_iter Ci, C_iter Cj);                           //!< Evaluate on CPU, queue on GPU
+  void evalM2L(Cells &cells);                                   //!< Evaluate queued M2L kernels
+  void evalM2P(C_iter Ci, C_iter Cj);                           //!< Evaluate on CPU, queue on GPU
+  void evalM2P(Cells &cells);                                   //!< Evaluate queued M2P kernels
+  void evalP2P(C_iter Ci, C_iter Cj);                           //!< Evaluate on CPU, queue on GPU
+  void evalP2P(Cells &cells);                                   //!< Evaluate queued P2P kernels (near field)
+  void evalL2L(Cells &cells);                                   //!< Evaluate all L2L kernels
+  void evalL2P(Cells &cells);                                   //!< Evaluate all L2P kernels
+  void evalEwaldReal(C_iter Ci, C_iter Cj);                     //!< Evaluate on CPU, queue on GPU
+  void evalEwaldReal(Cells &cells);                             //!< Evaluate queued Ewald real kernels
+};
+
+#if CPU
+#include "../kernel/CPUEvaluator.cxx"
+#elif GPU
+#include "../kernel/GPUEvaluator.cxx"
+#endif
+
+#undef splitFirst
+#endif
+/*
+Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef kernel_h
+#define kernel_h
+#include "sort.h"
+#define ODDEVEN(n) ((((n) & 1) == 1) ? -1 : 1)
+
+//! Unified CPU/GPU kernel class
+class KernelBase : public Sort {
+protected:
+  C_iter               Ci0;                                     //!< Begin iterator for target cells
+  C_iter               Cj0;                                     //!< Begin iterator for source cells
+
+  int                  ATOMS;                                   //!< Number of atom types in Van der Waals
+  std::vector<real>    RSCALE;                                  //!< Scaling parameter for Van der Waals
+  std::vector<real>    GSCALE;                                  //!< Scaling parameter for Van der Waals
+  real                 KSIZE;                                   //!< Number of waves in Ewald summation
+  real                 ALPHA;                                   //!< Scaling parameter for Ewald summation
+  real                 SIGMA;                                   //!< Scaling parameter for Ewald summation
+
+  std::vector<int>     keysHost;                                //!< Offsets for rangeHost
+  std::vector<int>     rangeHost;                               //!< Offsets for sourceHost
+  std::vector<gpureal> sourceHost;                              //!< Sources on host
+  std::vector<gpureal> targetHost;                              //!< Targets on host
+  std::vector<gpureal> constHost;                               //!< Constants on host
+  Map                  sourceBegin;                             //!< Define map for offset of source cells
+  Map                  sourceSize;                              //!< Define map for size of source cells
+  Map                  targetBegin;                             //!< Define map for offset of target cells
+  size_t               keysDevcSize;                            //!< Size of offsets for rangeDevc
+  size_t               rangeDevcSize;                           //!< Size of offsets for sourceDevc
+  size_t               sourceDevcSize;                          //!< Size of sources on device
+  size_t               targetDevcSize;                          //!< Size of targets on device
+  int                 *keysDevc;                                //!< Offsets for rangeDevc
+  int                 *rangeDevc;                               //!< Offsets for sourceDevc
+  gpureal             *sourceDevc;                              //!< Sources on device
+  gpureal             *targetDevc;                              //!< Targets on device
+
+  real *factorial;                                              //!< Factorial
+  real *prefactor;                                              //!< \f$ \sqrt{ \frac{(n - |m|)!}{(n + |m|)!} } \f$
+  real *Anm;                                                    //!< \f$ (-1)^n / \sqrt{ \frac{(n + m)!}{(n - m)!} } \f$
+  complex *Cnm;                                                 //!< M2L translation matrix \f$ C_{jn}^{km} \f$
+public:
+  vect X0;                                                      //!< Center of root cell
+  real R0;                                                      //!< Radius of root cell
+
+private:
+  real getBmax(vect const&X, C_iter C) const {
+    real rad = C->R;
+    real dx = rad+std::abs(X[0]-C->X[0]);
+    real dy = rad+std::abs(X[1]-C->X[1]);
+    real dz = rad+std::abs(X[2]-C->X[2]);
+    return std::sqrt( dx*dx + dy*dy + dz*dz );
+  }
+
+protected:
+  void setCenter(C_iter C) const {
+    real m = 0;
+    vect X = 0;
+    for( B_iter B=C->LEAF; B!=C->LEAF+C->NCLEAF; ++B ) {
+      m += std::abs(B->SRC);
+      X += B->X * std::abs(B->SRC);
+    }
+    for( C_iter c=Cj0+C->CHILD; c!=Cj0+C->CHILD+C->NCHILD; ++c ) {
+      m += std::abs(c->M[0]);
+      X += c->X * std::abs(c->M[0]);
+    }
+    X /= m;
+    C->R = getBmax(X,C);
+    C->X = X;
+  }
+
+//! Evaluate solid harmonics \f$ r^n Y_{n}^{m} \f$
+  void evalMultipole(real rho, real alpha, real beta, complex *Ynm, complex *YnmTheta) const {
+    const complex I(0.,1.);                                     // Imaginary unit
+    real x = std::cos(alpha);                                   // x = cos(alpha)
+    real y = std::sin(alpha);                                   // y = sin(alpha)
+    real fact = 1;                                              // Initialize 2 * m + 1
+    real pn = 1;                                                // Initialize Legendre polynomial Pn
+    real rhom = 1;                                              // Initialize rho^m
+    for( int m=0; m!=P; ++m ) {                                 // Loop over m in Ynm
+      complex eim = std::exp(I * real(m * beta));               //  exp(i * m * beta)
+      real p = pn;                                              //  Associated Legendre polynomial Pnm
+      int npn = m * m + 2 * m;                                  //  Index of Ynm for m > 0
+      int nmn = m * m;                                          //  Index of Ynm for m < 0
+      Ynm[npn] = rhom * p * prefactor[npn] * eim;               //  rho^m * Ynm for m > 0
+      Ynm[nmn] = std::conj(Ynm[npn]);                           //  Use conjugate relation for m < 0
+      real p1 = p;                                              //  Pnm-1
+      p = x * (2 * m + 1) * p1;                                 //  Pnm using recurrence relation
+      YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * prefactor[npn] * eim;// theta derivative of r^n * Ynm
+      rhom *= rho;                                              //  rho^m
+      real rhon = rhom;                                         //  rho^n
+      for( int n=m+1; n!=P; ++n ) {                             //  Loop over n in Ynm
+        int npm = n * n + n + m;                                //   Index of Ynm for m > 0
+        int nmm = n * n + n - m;                                //   Index of Ynm for m < 0
+        Ynm[npm] = rhon * p * prefactor[npm] * eim;             //   rho^n * Ynm
+        Ynm[nmm] = std::conj(Ynm[npm]);                         //   Use conjugate relation for m < 0
+        real p2 = p1;                                           //   Pnm-2
+        p1 = p;                                                 //   Pnm-1
+        p = (x * (2 * n + 1) * p1 - (n + m) * p2) / (n - m + 1);//   Pnm using recurrence relation
+        YnmTheta[npm] = rhon * ((n - m + 1) * p - (n + 1) * x * p1) / y * prefactor[npm] * eim;// theta derivative
+        rhon *= rho;                                            //   Update rho^n
+      }                                                         //  End loop over n in Ynm
+      pn = -pn * fact * y;                                      //  Pn
+      fact += 2;                                                //  2 * m + 1
+    }                                                           // End loop over m in Ynm
+  }
+
+//! Evaluate singular harmonics \f$ r^{-n-1} Y_n^m \f$
+  void evalLocal(real rho, real alpha, real beta, complex *Ynm, complex *YnmTheta) const {
+    const complex I(0.,1.);                                     // Imaginary unit
+    real x = std::cos(alpha);                                   // x = cos(alpha)
+    real y = std::sin(alpha);                                   // y = sin(alpha)
+    real fact = 1;                                              // Initialize 2 * m + 1
+    real pn = 1;                                                // Initialize Legendre polynomial Pn
+    real rhom = 1.0 / rho;                                      // Initialize rho^(-m-1)
+    for( int m=0; m!=2*P; ++m ) {                               // Loop over m in Ynm
+      complex eim = std::exp(I * real(m * beta));               //  exp(i * m * beta)
+      real p = pn;                                              //  Associated Legendre polynomial Pnm
+      int npn = m * m + 2 * m;                                  //  Index of Ynm for m > 0
+      int nmn = m * m;                                          //  Index of Ynm for m < 0
+      Ynm[npn] = rhom * p * prefactor[npn] * eim;               //  rho^(-m-1) * Ynm for m > 0
+      Ynm[nmn] = std::conj(Ynm[npn]);                           //  Use conjugate relation for m < 0
+      real p1 = p;                                              //  Pnm-1
+      p = x * (2 * m + 1) * p1;                                 //  Pnm using recurrence relation
+      YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * prefactor[npn] * eim;// theta derivative of r^n * Ynm
+      rhom /= rho;                                              //  rho^(-m-1)
+      real rhon = rhom;                                         //  rho^(-n-1)
+      for( int n=m+1; n!=2*P; ++n ) {                           //  Loop over n in Ynm
+        int npm = n * n + n + m;                                //   Index of Ynm for m > 0
+        int nmm = n * n + n - m;                                //   Index of Ynm for m < 0
+        Ynm[npm] = rhon * p * prefactor[npm] * eim;             //   rho^n * Ynm for m > 0
+        Ynm[nmm] = std::conj(Ynm[npm]);                         //   Use conjugate relation for m < 0
+        real p2 = p1;                                           //   Pnm-2
+        p1 = p;                                                 //   Pnm-1
+        p = (x * (2 * n + 1) * p1 - (n + m) * p2) / (n - m + 1);//   Pnm using recurrence relation
+        YnmTheta[npm] = rhon * ((n - m + 1) * p - (n + 1) * x * p1) / y * prefactor[npm] * eim;// theta derivative
+        rhon /= rho;                                            //   rho^(-n-1)
+      }                                                         //  End loop over n in Ynm
+      pn = -pn * fact * y;                                      //  Pn
+      fact += 2;                                                //  2 * m + 1
+    }                                                           // End loop over m in Ynm
+  }
+
+public:
+//! Constructor
+  KernelBase() : Ci0(), Cj0(), ATOMS(), RSCALE(), GSCALE(), KSIZE(), ALPHA(), SIGMA(),
+                 keysHost(), rangeHost(), sourceHost(), targetHost(), constHost(),
+                 sourceBegin(), sourceSize(), targetBegin(),
+                 keysDevcSize(0), rangeDevcSize(0),
+                 sourceDevcSize(0), targetDevcSize(0),
+                 keysDevc(), rangeDevc(), sourceDevc(), targetDevc(),
+                 factorial(), prefactor(), Anm(), Cnm(),
+                 X0(0), R0(0) {}
+//! Destructor
+  ~KernelBase() {}
+//! Copy constructor
+  KernelBase(const KernelBase&) : Sort(), Ci0(), Cj0(), ATOMS(), RSCALE(), GSCALE(), KSIZE(), ALPHA(), SIGMA(),
+                 keysHost(), rangeHost(), sourceHost(), targetHost(), constHost(),
+                 sourceBegin(), sourceSize(), targetBegin(),
+                 keysDevcSize(0), rangeDevcSize(0),
+                 sourceDevcSize(0), targetDevcSize(0),
+                 keysDevc(), rangeDevc(), sourceDevc(), targetDevc(),
+                 factorial(), prefactor(), Anm(), Cnm(),
+                 X0(0), R0(0) {}
+//! Overload assignment
+  KernelBase &operator=(const KernelBase) {return *this;}
+
+//! Set center of root cell
+  void setX0(vect x0) {X0 = x0;}
+//! Set radius of root cell
+  void setR0(real r0) {R0 = r0;}
+
+//! Get center of root cell
+  vect getX0() const {return X0;}
+//! Get radius of root cell
+  real getR0() const {return R0;}
+
+//! Set center and size of root cell
+  void setDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
+    vect xmin,xmax;                                             // Min,Max of domain
+    B_iter B = bodies.begin();                                  // Reset body iterator
+    xmin = xmax = B->X;                                         // Initialize xmin,xmax
+    for( B=bodies.begin(); B!=bodies.end(); ++B ) {             // Loop over bodies
+      for( int d=0; d!=3; ++d ) {                               //  Loop over each dimension
+        if     (B->X[d] < xmin[d]) xmin[d] = B->X[d];           //   Determine xmin
+        else if(B->X[d] > xmax[d]) xmax[d] = B->X[d];           //   Determine xmax
+      }                                                         //  End loop over each dimension
+    }                                                           // End loop over bodies
+    if( IMAGES != 0 ) {                                         // If periodic boundary condition
+      if( xmin[0] < x0[0]-r0 || x0[0]+r0 < xmax[0]              //  Check for outliers in x direction
+       || xmin[1] < x0[1]-r0 || x0[1]+r0 < xmax[1]              //  Check for outliers in y direction
+       || xmin[2] < x0[2]-r0 || x0[2]+r0 < xmax[2] ) {          //  Check for outliers in z direction
+        std::cout << "Error: Particles located outside periodic domain : " << std::endl;// Print error message
+        std::cout << xmin << std::endl;
+        std::cout << xmax << std::endl;
+      }                                                         //  End if for outlier checking
+      X0 = x0;                                                  //  Center is [0, 0, 0]
+      R0 = r0;                                                  //  Radius is r0
+    } else {
+      for( int d=0; d!=3; ++d ) {                               // Loop over each dimension
+        X0[d] = (xmax[d] + xmin[d]) / 2;                        // Calculate center of domain
+        X0[d] = int(X0[d]+.5);                                  //  Shift center to nearest integer
+        R0 = std::max(xmax[d] - X0[d], R0);                     //  Calculate max distance from center
+        R0 = std::max(X0[d] - xmin[d], R0);                     //  Calculate max distance from center
+      }                                                         // End loop over each dimension
+      R0 *= 1.000001;                                           // Add some leeway to root radius
+    }                                                           // Endif for periodic boundary condition
+  }
+
+//! Precalculate M2L translation matrix
+  void preCalculation() {
+    const complex I(0.,1.);                                     // Imaginary unit
+    factorial = new real  [P];                                  // Factorial
+    prefactor = new real  [4*P*P];                              // sqrt( (n - |m|)! / (n + |m|)! )
+    Anm       = new real  [4*P*P];                              // (-1)^n / sqrt( (n + m)! / (n - m)! )
+    Cnm       = new complex [P*P*P*P];                          // M2L translation matrix Cjknm
+
+    factorial[0] = 1;                                           // Initialize factorial
+    for( int n=1; n!=P; ++n ) {                                 // Loop to P
+      factorial[n] = factorial[n-1] * n;                        //  n!
+    }                                                           // End loop to P
+
+    for( int n=0; n!=2*P; ++n ) {                               // Loop over n in Anm
+      for( int m=-n; m<=n; ++m ) {                              //  Loop over m in Anm
+        int nm = n*n+n+m;                                       //   Index of Anm
+        int nabsm = abs(m);                                     //   |m|
+        real fnmm = EPS;                                        //   Initialize (n - m)!
+        for( int i=1; i<=n-m; ++i ) fnmm *= i;                  //   (n - m)!
+        real fnpm = EPS;                                        //   Initialize (n + m)!
+        for( int i=1; i<=n+m; ++i ) fnpm *= i;                  //   (n + m)!
+        real fnma = 1.0;                                        //   Initialize (n - |m|)!
+        for( int i=1; i<=n-nabsm; ++i ) fnma *= i;              //   (n - |m|)!
+        real fnpa = 1.0;                                        //   Initialize (n + |m|)!
+        for( int i=1; i<=n+nabsm; ++i ) fnpa *= i;              //   (n + |m|)!
+        prefactor[nm] = std::sqrt(fnma/fnpa);                   //   sqrt( (n - |m|)! / (n + |m|)! )
+        Anm[nm] = ODDEVEN(n)/std::sqrt(fnmm*fnpm);              //   (-1)^n / sqrt( (n + m)! / (n - m)! )
+      }                                                         //  End loop over m in Anm
+    }                                                           // End loop over n in Anm
+
+    for( int j=0, jk=0, jknm=0; j!=P; ++j ) {                   // Loop over j in Cjknm
+      for( int k=-j; k<=j; ++k, ++jk ){                         //  Loop over k in Cjknm
+        for( int n=0, nm=0; n!=P; ++n ) {                       //   Loop over n in Cjknm
+          for( int m=-n; m<=n; ++m, ++nm, ++jknm ) {            //    Loop over m in Cjknm
+            const int jnkm = (j+n)*(j+n)+j+n+m-k;               //     Index C_{j+n}^{m-k}
+            Cnm[jknm] = std::pow(I,real(abs(k-m)-abs(k)-abs(m)))//     Cjknm
+                      * real(ODDEVEN(j)*Anm[nm]*Anm[jk]/Anm[jnkm]) * EPS;
+          }                                                     //    End loop over m in Cjknm
+        }                                                       //   End loop over n in Cjknm
+      }                                                         //  End loop over in k in Cjknm
+    }                                                           // End loop over in j in Cjknm
+  }
+
+//! Free temporary allocations
+  void postCalculation() {
+    delete[] factorial;                                         // Free factorial
+    delete[] prefactor;                                         // Free sqrt( (n - |m|)! / (n + |m|)! )
+    delete[] Anm;                                               // Free (-1)^n / sqrt( (n + m)! / (n - m)! )
+    delete[] Cnm;                                               // Free M2L translation matrix Cjknm
+  }
+
+//! Set paramters for Van der Waals
+  void setVanDerWaals(int atoms, double *rscale, double *gscale) {
+    assert(atoms <= 16);                                        // Change GPU constant memory alloc if needed
+    THETA = .1;                                                 // Force opening angle to be small
+    ATOMS = atoms;                                              // Set number of atom types
+    RSCALE.resize(ATOMS*ATOMS);                                 // Resize rscale vector
+    GSCALE.resize(ATOMS*ATOMS);                                 // Resize gscale vector
+    for( int i=0; i!=ATOMS*ATOMS; ++i ) {                       // Loop over scale vector
+      RSCALE[i] = rscale[i];                                    //  Set rscale vector
+      GSCALE[i] = gscale[i];                                    //  Set gscale vector
+    }                                                           // End loop over scale vector
+  }
+
+//! Set paramters for Ewald summation
+  void setEwald(real ksize, real alpha, real sigma) {
+    KSIZE = ksize;                                              // Set number of waves
+    ALPHA = alpha;                                              // Set scaling parameter
+    SIGMA = sigma;                                              // Set scaling parameter
+  }
+
+};
+
+template<Equation equation>
+class Kernel : public KernelBase {
+public:
+  void initialize();                                            //!< Initialize kernels
+  void P2M(C_iter Ci);                                          //!< Evaluate P2M kernel on CPU
+  void M2M(C_iter Ci);                                          //!< Evaluate M2M kernel on CPU
+  void M2L(C_iter Ci, C_iter Cj) const;                         //!< Evaluate M2L kernel on CPU
+  void M2P(C_iter Ci, C_iter Cj) const;                         //!< Evaluate M2P kernel on CPU
+  void P2P(C_iter Ci, C_iter Cj) const;                         //!< Evaluate P2P kernel on CPU
+  void L2L(C_iter Ci) const;                                    //!< Evaluate L2L kernel on CPU
+  void L2P(C_iter Ci) const;                                    //!< Evaluate L2P kernel on CPU
+  void EwaldReal(C_iter Ci, C_iter Cj) const;                   //!< Evaluate Ewald real part on CPU
+  void EwaldWave(Bodies &bodies) const;                         //!< Evaluate Ewald wave part on CPU
+  void P2M();                                                   //!< Evaluate P2M kernel on GPU
+  void M2M();                                                   //!< Evaluate M2M kernel on GPU
+  void M2L();                                                   //!< Evaluate M2L kernel on GPU
+  void M2P();                                                   //!< Evaluate M2P kernel on GPU
+  void P2P();                                                   //!< Evalaute P2P kernel on GPU
+  void L2L();                                                   //!< Evaluate L2L kernel on GPU
+  void L2P();                                                   //!< Evaluate L2P kernel on GPU
+  void EwaldReal();                                             //!< Evaluate Ewald real part on GPU
+  void EwaldWave();                                             //!< Evalaute Ewald wave part on GPU
+  void finalize();                                              //!< Finalize kernels
+
+  void allocate();                                              //!< Allocate GPU variables
+  void hostToDevice();                                          //!< Copy from host to device
+  void deviceToHost();
+};
+
+#endif
+/*
+Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef logger_h
+#define logger_h
+#include <sys/time.h>
+#include "types.h"
+
+//! Timer and Trace logger
+class Logger {
+private:
+  std::ofstream   timerFile;                                    //!< File ID to store log
+  Timer           beginTimer;                                   //!< Timer base value
+  Timer           timer;                                        //!< Stores timings for all events
+  Traces          traces;                                       //!< Stores traces for all events
+  pthread_mutex_t mutex;                                        //!< Pthread communicator
+
+//! Timer function
+  double get_time() const {
+    struct timeval tv;                                          // Time value
+    gettimeofday(&tv, NULL);                                    // Get time of day in seconds and microseconds
+    return double(tv.tv_sec+tv.tv_usec*1e-6);                   // Combine seconds and microseconds and return
+  }
+
+public:
+  int stringLength;                                             //!< Max length of event name
+  bool printNow;                                                //!< Switch to print timings
+
+//! Constructor
+  Logger() : timerFile("time.dat"),                             // Open timer log file
+             beginTimer(), timer(), traces(), mutex(),          // Initializing class variables (empty)
+             stringLength(20),                                  // Max length of event name
+             printNow(false) {                                  // Don't print timings by default
+    pthread_mutex_init(&mutex,NULL);                            // Initialize pthread communicator
+  }
+//! Destructor
+  ~Logger() {
+    timerFile.close();                                          // Close timer log file
+  }
+
+//! Start timer for given event
+  inline void startTimer(std::string event) {
+    beginTimer[event] = get_time();                             // Get time of day and store in beginTimer
+  }
+
+//! Stop timer for given event
+  double stopTimer(std::string event, bool print=false) {
+    double endTimer = get_time();                               // Get time of day and store in endTimer
+    timer[event] += endTimer - beginTimer[event];               // Accumulate event time to timer
+    if(print) std::cout << std::setw(stringLength) << std::left // Set format
+                        << event << " : " << timer[event] << std::endl;// Print event and timer to screen
+    return endTimer - beginTimer[event];                        // Return the event time
+  }
+
+//! Erase entry in timer
+  inline void eraseTimer(std::string event) {
+    timer.erase(event);                                         // Erase event from timer
+  }
+
+//! Erase all events in timer
+  inline void resetTimer() {
+    timer.clear();                                              // Clear timer
+  }
+
+//! Print timings of a specific event
+  inline void printTime(std::string event) {
+    std::cout << std::setw(stringLength) << std::left           // Set format
+              << event << " : " << timer[event] << std::endl;   // Print event and timer
+  }
+
+//! Print timings of all events
+  inline void printAllTime() {
+    for( TI_iter E=timer.begin(); E!=timer.end(); ++E ) {       // Loop over all events
+      std::cout << std::setw(stringLength) << std::left         //  Set format
+                << E->first << " : " << E->second << std::endl; //  Print event and timer
+    }                                                           // End loop over all events
+  }
+
+//! Write timings of all events
+  inline void writeTime() {
+    for( TI_iter E=timer.begin(); E!=timer.end(); ++E ) {       // Loop over all events
+      timerFile << std::setw(stringLength) << std::left         //  Set format
+                << E->first << " " << E->second << std::endl;   //  Print event and timer
+    }                                                           // End loop over all events
+  }
+
+//! Start PAPI event
+  inline void startPAPI() {
+#if PAPI
+    int events[3] = { PAPI_L2_DCM, PAPI_L2_DCA, PAPI_TLB_DM };  // PAPI event type
+    PAPI_library_init(PAPI_VER_CURRENT);                        // PAPI initialize
+    PAPI_create_eventset(&PAPIEVENT);                           // PAPI create event set
+    PAPI_add_events(PAPIEVENT, events, 3);                      // PAPI add events
+    PAPI_start(PAPIEVENT);                                      // PAPI start
+#endif
+  }
+
+//! Stop PAPI event
+  inline void stopPAPI() {
+#if PAPI
+    long long values[3] = {0,0,0};                              // Values for each event
+    PAPI_stop(PAPIEVENT,values);                                // PAPI stop
+    std::cout << "L2 Miss: " << values[0]                       // Print L2 Misses
+              << " L2 Access: " << values[1]                    // Print L2 Access
+              << " TLB Miss: " << values[2] << std::endl;       // Print TLB Misses
+#endif
+  }
+
+//! Start tracer for given event
+  inline void startTracer(ThreadTrace &beginTrace) {
+    pthread_mutex_lock(&mutex);                                 // Lock shared variable access
+    beginTrace[pthread_self()] = get_time();                    // Get time of day and store in beginTrace
+    pthread_mutex_unlock(&mutex);                               // Unlock shared variable access
+  }
+
+//! Stop tracer for given event
+  inline void stopTracer(ThreadTrace &beginTrace, int color) {
+    pthread_mutex_lock(&mutex);                                 // Lock shared variable access
+    Trace trace;                                                // Define trace structure
+    trace.thread = pthread_self();                              // Store pthread id
+    trace.begin  = beginTrace[pthread_self()];                  // Store tic
+    trace.end    = get_time();                                  // Store toc
+    trace.color  = color;                                       // Store color of event
+    traces.push(trace);                                         // Push trace to queue of traces
+    pthread_mutex_unlock(&mutex);                               // Unlock shared variable access
+  }
+
+//! Write traces of all events
+  inline void writeTrace() {
+    char fname[256];                                            // File name
+    sprintf(fname,"trace%4.4d.svg",MPIRANK);                    // Create file name for trace
+    std::ofstream traceFile(fname);                             // Open trace log file
+    traceFile << "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" // Header statements for trace log file
+        << "<!DOCTYPE svg PUBLIC \"-_W3C_DTD SVG 1.0_EN\" \"http://www.w3.org/TR/SVG/DTD/svg10.dtd\">\n"
+        << "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"\n"
+        << "  width=\"200mm\" height=\"40mm\" viewBox=\"0 0 20000 4000\">\n"
+        << "  <g>\n";
+    int num_thread = 0;                                         // Counter for number of threads to trace
+    ThreadMap threadMap;                                        // Map pthread ID to thread ID
+    double base = traces.front().begin;                         // Base time
+    double scale = 30000.0;                                     // Scale the length of bar plots
+    while( !traces.empty() ) {                                  // While queue of traces is not empty
+      Trace trace = traces.front();                             //  Get trace at front of the queue
+      traces.pop();                                             //  Pop trace at front
+      pthread_t thread = trace.thread;                          //  Get pthread ID of trace
+      double begin  = trace.begin;                              //  Get begin time of trace
+      double end    = trace.end;                                //  Get end time of trace
+      int    color  = trace.color;                              //  Get color of trace
+      if( threadMap[thread] == 0 ) {                            //  If it's a new pthread ID
+        threadMap[thread] = ++num_thread;                       //   Map it to an incremented thread ID
+      }                                                         //  End if for new pthread ID
+      begin -= base;                                            //  Subtract base time from begin time
+      end   -= base;                                            //  Subtract base time from end time
+      traceFile << "    <rect x=\"" << begin * scale            //  x position of bar plot
+          << "\" y=\"" << (threadMap[thread] - 1) * 100.0       //  y position of bar plot
+          << "\" width=\"" << (end - begin) * scale             //  width of bar
+          << "\" height=\"90.0\" fill=\"#"<< std::setfill('0') << std::setw(6) << std::hex << color// height of bar
+          << "\" stroke=\"#000000\" stroke-width=\"1\"/>\n";    //  stroke color and width
+    }                                                           // End while loop for queue of traces
+    traceFile << "  </g>\n" "</svg>\n";                         // Footer for trace log file 
+    traceFile.close();                                          // Close trace log file
+  }
+};
+
+#endif
+/*
+Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef mympi_h
+#define mympi_h
+#include <mpi.h>
+#include <typeinfo>
+#include "types.h"
+
+//! Custom MPI utilities
+class MyMPI {
+private:
+  int       EXTERNAL;                                           //!< Flag to indicate external MPI_Init/Finalize
+
+protected:
+  int       MPISIZES;                                           //!< Number of MPI processes for split communicator
+  int       MPIRANKS;                                           //!< Rank of current MPI process for split communicator
+  const int WAIT;                                               //!< Waiting time between output of different ranks
+
+public:
+//! Constructor, initialize WAIT time
+  MyMPI() : EXTERNAL(0), MPISIZES(0), MPIRANKS(0), WAIT(100) {  // Constructor, initialize WAIT time
+    int argc(0);                                                // Dummy argument count
+    char **argv;                                                // Dummy argument value
+    MPI_Initialized(&EXTERNAL);                                 // Check if MPI_Init has been called
+    if(!EXTERNAL) MPI_Init(&argc,&argv);                        // Initialize MPI communicator
+    MPI_Comm_size(MPI_COMM_WORLD,&MPISIZE);                     // Get number of MPI processes
+    MPI_Comm_rank(MPI_COMM_WORLD,&MPIRANK);                     // Get rank of current MPI process
+    DEVICE = MPIRANK % GPUS;                                    // Get GPU device ID from MPI rank
+  }
+
+//! Destructor
+  ~MyMPI() {
+    if(!EXTERNAL) MPI_Finalize();                               // Finalize MPI communicator
+  }
+
+//! If n is power of two return true
+  bool isPowerOfTwo(const int n) {
+    return ((n != 0) && !(n & (n - 1)));                        // Decrement and compare bits
+  }
+
+//! Split range and return partial range
+  void splitRange(int &begin, int &end, int iSplit, int numSplit) {
+    int size = end - begin;                                     // Size of range
+    int increment = size / numSplit;                            // Increment of splitting
+    int remainder = size % numSplit;                            // Remainder of splitting
+    begin += iSplit * increment + std::min(iSplit,remainder);   // Increment the begin counter
+    end = begin + increment;                                    // Increment the end counter
+    if( remainder > iSplit ) end++;                             // Adjust the end counter for remainder
+  }
+
+//! Get MPI data type
+  template<typename T>
+  MPI_Datatype getType(T object) {
+    MPI_Datatype type = MPI_BYTE;                               // MPI data type
+    if       ( typeid(object) == typeid(char) ) {               // If data type is char
+      type = MPI_CHAR;                                          //  use MPI_CHAR
+    } else if( typeid(object) == typeid(short) ) {              // If data type is short
+      type = MPI_SHORT;                                         //  use MPI_SHORT
+    } else if( typeid(object) == typeid(int) ) {                // If data type is int
+      type = MPI_INT;                                           //  use MPI_INT
+    } else if( typeid(object) == typeid(long) ) {               // If data type is long
+      type = MPI_LONG;                                          //  use MPI_LONG
+    } else if( typeid(object) == typeid(long long) ) {          // If data type is long long
+      type = MPI_LONG_LONG;                                     //  use MPI_LONG_LONG
+    } else if( typeid(object) == typeid(unsigned char) ) {      // If data type is unsigned char
+      type = MPI_UNSIGNED_CHAR;                                 //  use MPI_UNSIGNED_CHAR
+    } else if( typeid(object) == typeid(unsigned short) ) {     // If data type is unsigned short
+      type = MPI_UNSIGNED_SHORT;                                //  use MPI_UNSIGNED_SHORT
+    } else if( typeid(object) == typeid(unsigned int) ) {       // If data type is unsigned int
+      type = MPI_UNSIGNED;                                      //  use MPI_UNSIGNED
+    } else if( typeid(object) == typeid(unsigned long) ) {      // If data type is unsigned long
+      type = MPI_UNSIGNED_LONG;                                 //  use MPI_UNSIGNED_LONG
+    } else if( typeid(object) == typeid(unsigned long long) ) { // If data type is unsigned long long
+      type = MPI_UNSIGNED_LONG_LONG;                            //  use MPI_UNSIGNED_LONG_LONG
+    } else if( typeid(object) == typeid(float) ) {              // If data type is float
+      type = MPI_FLOAT;                                         //  use MPI_FLOAT
+    } else if( typeid(object) == typeid(double) ) {             // If data type is double
+      type = MPI_DOUBLE;                                        //  use MPI_DOUBLE
+    } else if( typeid(object) == typeid(long double) ) {        // If data type is long double
+      type = MPI_LONG_DOUBLE;                                   //  use MPI_LONG_DOUBLE
+    } else if( typeid(object) == typeid(std::complex<float>) ) {// If data type is complex<float>
+      type = MPI_COMPLEX;                                       //  use MPI_COMPLEX
+    } else if( typeid(object) == typeid(std::complex<double>) ) {// If data type is compelx<double>
+      type = MPI_DOUBLE_COMPLEX;                                //  use MPI_DOUBLE_COMPLEX
+    }                                                           // Endif for data type
+    return type;                                                // Return MPI data type
+  }
+
+//! Print a scalar value on all ranks
+  template<typename T>
+  void print(T data) {
+    for( int irank=0; irank!=MPISIZE; ++irank ) {               // Loop over ranks
+      MPI_Barrier(MPI_COMM_WORLD);                              //  Sync processes
+      usleep(WAIT);                                             //  Wait "WAIT" milliseconds
+      if( MPIRANK == irank ) std::cout << data << " ";          //  If it's my turn print "data"
+    }                                                           // End loop over ranks
+    MPI_Barrier(MPI_COMM_WORLD);                                // Sync processes
+    usleep(WAIT);                                               // Wait "WAIT" milliseconds
+    if( MPIRANK == 0 ) std::cout << std::endl;                  // New line
+  }
+
+//! Print a scalar value on irank
+  template<typename T>
+  void print(T data, const int irank) {
+    MPI_Barrier(MPI_COMM_WORLD);                                // Sync processes
+    usleep(WAIT);                                               // Wait "WAIT" milliseconds
+    if( MPIRANK == irank ) std::cout << data;                   // If it's my rank print "data"
+  }
+
+//! Print a vector value on all ranks
+  template<typename T>
+  void print(T *data, const int begin, const int end) {
+    for( int irank=0; irank!=MPISIZE; ++irank ) {               // Loop over ranks
+      MPI_Barrier(MPI_COMM_WORLD);                              //  Sync processes
+      usleep(WAIT);                                             //  Wait "WAIT" milliseconds
+      if( MPIRANK == irank ) {                                  //  If it's my turn to print
+        std::cout << MPIRANK << " : ";                          //   Print rank
+        for( int i=begin; i!=end; ++i ) {                       //   Loop over data
+          std::cout << data[i] << " ";                          //    Print data[i]
+        }                                                       //   End loop over data
+        std::cout << std::endl;                                 //   New line
+      }                                                         //  Endif for my turn
+    }                                                           // End loop over ranks
+  }
+
+//! Print a vector value on irank
+  template<typename T>
+  void print(T *data, const int begin, const int end, const int irank) {
+    MPI_Barrier(MPI_COMM_WORLD);                                // Sync processes
+    usleep(WAIT);                                               // Wait "WAIT" milliseconds
+    if( MPIRANK == irank ) {                                    // If it's my rank
+      std::cout << MPIRANK << " : ";                            //  Print rank
+      for( int i=begin; i!=end; ++i ) {                         //  Loop over data
+        std::cout << data[i] << " ";                            //   Print data[i]
+      }                                                         //  End loop over data
+      std::cout << std::endl;                                   //  New line
+    }                                                           // Endif for my rank
+  }
+};
+
+#endif

include/parallelfmm.h

+/*
+Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef parallelfmm_h
+#define parallelfmm_h
+#include "partition.h"
+
+//! Handles all the communication of local essential trees
+template<Equation equation>
+class ParallelFMM : public Partition<equation> {
+private:
+  std::vector<int>    sendBodyCnt;                              //!< Vector of body send counts
+  std::vector<int>    sendBodyDsp;                              //!< Vector of body send displacements
+  std::vector<int>    recvBodyCnt;                              //!< Vector of body recv counts
+  std::vector<int>    recvBodyDsp;                              //!< Vector of body recv displacements
+  std::vector<int>    sendBodyRanks;                            //!< Vector of ranks to send bodies to
+  std::vector<int>    sendBodyCellCnt;                          //!< Vector of send counts for cells of bodies
+  std::vector<C_iter> sendBodyCells;                            //!< Vector of cell iterators for cells of bodies to send
+  std::vector<int>    sendCellCnt;                              //!< Vector of cell send counts
+  std::vector<int>    sendCellDsp;                              //!< Vector of cell send displacements
+  std::vector<int>    recvCellCnt;                              //!< Vector of cell recv counts
+  std::vector<int>    recvCellDsp;                              //!< Vector of cell recv displacements
+  std::vector<vect>   xminAll;                                  //!< Buffer for gathering XMIN
+  std::vector<vect>   xmaxAll;                                  //!< Buffer for gathering XMAX
+
+  JBodies sendBodies;                                           //!< Send buffer for bodies
+  JBodies recvBodies;                                           //!< Recv buffer for bodies
+  JCells  sendCells;                                            //!< Send buffer for cells
+  JCells  recvCells;                                            //!< Recv buffer for cells
+
+public:
+  using Kernel<equation>::printNow;                             //!< Switch to print timings
+  using Kernel<equation>::startTimer;                           //!< Start timer for given event
+  using Kernel<equation>::stopTimer;                            //!< Stop timer for given event
+  using Kernel<equation>::sortBodies;                           //!< Sort bodies according to cell index
+  using Kernel<equation>::sortCells;                            //!< Sort cells according to cell index
+  using Kernel<equation>::R0;                                   //!< Radius of root cell
+  using TreeStructure<equation>::buffer;                        //!< Buffer for MPI communication & sorting
+  using TreeStructure<equation>::getLevel;                      //!< Get level from cell index
+  using TreeStructure<equation>::getCenter;                     //!< Get cell center and radius from cell index
+  using TreeStructure<equation>::bodies2twigs;                  //!< Group bodies into twig cells
+  using TreeStructure<equation>::twigs2cells;                   //!< Link twigs bottomup to create all cells in tree
+  using Partition<equation>::isPowerOfTwo;                      //!< If n is power of two return true
+  using Partition<equation>::splitRange;                        //!< Split range and return partial range
+  using Partition<equation>::print;                             //!< Print in MPI
+  using Partition<equation>::LEVEL;                             //!< Level of the MPI process binary tree
+  using Partition<equation>::XMIN;                              //!< Minimum position vector of bodies
+  using Partition<equation>::XMAX;                              //!< Maximum position vector of bodies
+  using Partition<equation>::nprocs;                            //!< Number of processes in the two split groups
+  using Partition<equation>::color;                             //!< Color for hypercube communicators
+  using Partition<equation>::key;                               //!< Key for hypercube communicators
+  using Partition<equation>::MPI_COMM;                          //!< Hypercube communicators
+
+private:
+//! Gather bounds of other domain
+  void gatherBounds() {
+    xminAll.resize(MPISIZE);                                    // Resize buffer for gathering xmin
+    xmaxAll.resize(MPISIZE);                                    // Resize buffer for gathering xmax
+    sendBodyCnt.resize(MPISIZE);                                // Resize vector of body send counts
+    sendBodyDsp.resize(MPISIZE);                                // Resize vector of body send displacements
+    recvBodyCnt.resize(MPISIZE);                                // Resize vector of body recv counts
+    recvBodyDsp.resize(MPISIZE);                                // Resize vector of body recv displacements
+    sendCellCnt.resize(MPISIZE);                                // Resize vector of cell send counts
+    sendCellDsp.resize(MPISIZE);                                // Resize vector of cell send displacements
+    recvCellCnt.resize(MPISIZE);                                // Resize vector of cell recv counts
+    recvCellDsp.resize(MPISIZE);                                // Resize vector of cell recv displacements
+    MPI_Datatype MPI_TYPE = getType(XMIN[LEVEL][0]);            // Get MPI data type
+    MPI_Allgather(&XMIN[LEVEL][0],3,MPI_TYPE,                   // Gather XMIN
+                  &xminAll[0][0],3,MPI_TYPE,MPI_COMM_WORLD);
+    MPI_Allgather(&XMAX[LEVEL][0],3,MPI_TYPE,                   // Gather XMAX
+                  &xmaxAll[0][0],3,MPI_TYPE,MPI_COMM_WORLD);
+  }
+
+//! Get neighbor ranks to send to
+  void getSendRank(Cells &cells) {
+    sendBodyRanks.clear();                                      // Clear send ranks
+    sendBodyCellCnt.clear();                                    // Clear send counts
+    sendBodyCells.clear();                                      // Clear send body cells
+    int oldsize = 0;                                            // Per rank offset of the number of cells to send
+    for( int irank=0; irank!=MPISIZE; ++irank ) {               // Loop over ranks
+      int ic = 0;                                               //  Initialize neighbor dimension counter
+      for( int d=0; d!=3; ++d ) {                               //  Loop over dimensions
+        if(xminAll[irank][d] < 2 * XMAX[LEVEL][d] - XMIN[LEVEL][d] &&// If the two domains are touching or overlapping
+           xmaxAll[irank][d] > 2 * XMIN[LEVEL][d] - XMAX[LEVEL][d]) {// in all dimensions, they are neighboring domains
+          ic++;                                                 //    Increment neighbor dimension counter
+        }                                                       //   Endif for overlapping domains
+      }                                                         //  End loop over dimensions
+      ic = 3;
+      if( ic == 3 && irank != MPIRANK ) {                       //  If ranks are neighbors in all dimensions
+        for( C_iter C=cells.begin(); C!=cells.end(); ++C ) {    //   Loop over cells
+          if( C->NCHILD == 0 ) {                                //    If cell is a twig
+            bool send = false;                                  //     Initialize logical for sending
+            if( IMAGES == 0 ) {                                 //     If free boundary condition
+              Xperiodic = 0;                                    //      Set periodic coordinate offset
+              real R = getDistance(C,xminAll[irank],xmaxAll[irank]);//  Get distance to other domain
+              send |= CLET * C->R > THETA * R - EPS2;           //      If the cell seems close enough for P2P
+            } else {                                            //     If periodic boundary condition
+              for( int ix=-1; ix<=1; ++ix ) {                   //      Loop over x periodic direction
+                for( int iy=-1; iy<=1; ++iy ) {                 //       Loop over y periodic direction
+                  for( int iz=-1; iz<=1; ++iz ) {               //        Loop over z periodic direction
+                    Xperiodic[0] = ix * 2 * R0;                 //         Coordinate offset for x periodic direction
+                    Xperiodic[1] = iy * 2 * R0;                 //         Coordinate offset for y periodic direction
+                    Xperiodic[2] = iz * 2 * R0;                 //         Coordinate offset for z periodic direction
+                    real R = getDistance(C,xminAll[irank],xmaxAll[irank]);// Get distance to other domain
+                    send |= CLET * C->R > THETA * R - EPS2;     //         If the cell seems close enough for P2P
+                  }                                             //        End loop over z periodic direction
+                }                                               //       End loop over y periodic direction
+              }                                                 //      End loop over x periodic direction
+            }                                                   //     Endif for periodic boundary condition
+            if( send ) {                                        //     If the cell seems close enough for P2P
+              sendBodyCells.push_back(C);                       //      Add cell iterator to scells
+            }                                                   //     Endif for cell distance
+          }                                                     //    Endif for twigs
+        }                                                       //   End loop over cells
+        sendBodyRanks.push_back(irank);                         //   Add current rank to sendBodyRanks
+        sendBodyCellCnt.push_back(sendBodyCells.size()-oldsize);//   Add current cell count to sendBodyCellCnt
+        oldsize = sendBodyCells.size();                         //   Set new offset for cell count
+      }                                                         //  Endif for neighbor ranks
+    }                                                           // End loop over ranks
+  }
+
+//! Get size of data to send
+  void getSendCount(bool comm=true) {
+    int ic = 0, ssize = 0;                                      // Initialize counter and offset for scells
+    sendBodyCnt.assign(MPISIZE,0);                              // Initialize send count
+    sendBodyDsp.assign(MPISIZE,0);                              // Initialize send displacement
+    for( int i=0; i!=int(sendBodyRanks.size()); ++i ) {         // Loop over ranks to send to & recv from
+      int irank = sendBodyRanks[i];                             //  Rank to send to & recv from
+      for( int c=0; c!=sendBodyCellCnt[i]; ++c,++ic ) {         //  Loop over cells to send to that rank
+        C_iter C = sendBodyCells[ic];                           //   Set cell iterator
+        for( B_iter B=C->LEAF; B!=C->LEAF+C->NDLEAF; ++B ) {    //   Loop over bodies in that cell
+          JBody body;                                           //    Set compact body type for sending
+          body.ICELL = B->ICELL;                                //    Set cell index of compact body type
+          body.X     = B->X;                                    //    Set position of compact body type
+          body.SRC   = B->SRC;                                  //    Set source values of compact body type
+          sendBodies.push_back(body);                           //    Push it into the send buffer
+        }                                                       //   End loop over bodies
+      }                                                         //  End loop over cells
+      sendBodyCnt[irank] = sendBodies.size()-ssize;             //  Set send count of current rank
+      sendBodyDsp[irank] = ssize;                               //  Set send displacement of current rank
+      ssize += sendBodyCnt[irank];                              //  Increment offset for vector scells
+    }                                                           // End loop over ranks
+    if( comm ) {                                                // If communication is necessary
+      MPI_Alltoall(&sendBodyCnt[0],1,MPI_INT,&recvBodyCnt[0],1,MPI_INT,MPI_COMM_WORLD);// Communicate the send counts
+      int rsize = 0;                                            // Initialize total recv count
+      for( int i=0; i!=MPISIZE; ++i ) {                         // Loop over ranks to recv from
+        recvBodyDsp[i] = rsize;                                 //  Set recv displacements
+        rsize += recvBodyCnt[i];                                //  Accumulate recv counts
+      }                                                         // End loop over ranks to recv from
+      recvBodies.resize(rsize);                                 // Resize recv buffer
+    }
+  }
+
+//! Communicate cells by one-to-one MPI_Alltoallv
+  void commBodiesAlltoall() {
+    assert(isPowerOfTwo(MPISIZE));                              // Make sure the number of processes is a power of two
+    int bytes = sizeof(sendBodies[0]);                          // Byte size of jbody structure
+    int *scntd = new int [MPISIZE];                             // Permuted send count
+    int *rcntd = new int [MPISIZE];                             // Permuted recv count
+    int *rdspd = new int [MPISIZE];                             // Permuted recv displacement
+    int *irev  = new int [MPISIZE];                             // Map original to compressed index
+    JBodies sendBuffer = sendBodies;                            // Send buffer
+    JBodies recvBuffer;                                         // Recv buffer
+    for( int l=0; l!=LEVEL; ++l ) {                             // Loop over levels of N-D hypercube communication
+      int npart = 1 << (LEVEL - l - 1);                         // Size of partition block
+      int scnt2[2], sdsp2[2], rcnt2[2], rdsp2[2];               // Send/recv counts/displacements per level
+      int ic = 0;                                               // Initialize counter
+      for( int i=0; i!=2; ++i ) {                               // Loop over the two blocks
+        scnt2[i] = 0;                                           //  Initialize send count per level
+        for( int irank=0; irank!=MPISIZE/2; ++irank ) {         //  Loop over ranks in each block
+          int idata = (irank / npart) * 2 * npart + irank % npart + i * npart;// Original index
+          int isend = i * MPISIZE / 2 + irank;                  //   Compressed index
+          irev[idata] = isend;                                  //   Map original to compressed index
+          scntd[isend] = sendBodyCnt[idata];                    //   Permuted send count
+          scnt2[i] += sendBodyCnt[idata] * bytes;               //   Send count per block
+          for( int id=sendBodyDsp[idata]; id!=sendBodyDsp[idata]+sendBodyCnt[idata]; ++id,++ic ) {// Loop over bodies
+            sendBuffer[ic] = sendBodies[id];                    //    Fill send buffer
+          }                                                     //   End loop over bodies
+        }                                                       //  End loop over ranks in each block
+      }                                                         // End loop over blocks
+      MPI_Alltoall(scntd,MPISIZE/2,MPI_INT,rcntd,MPISIZE/2,MPI_INT,MPI_COMM[l+1][2]);// Comm permuted count
+      MPI_Alltoall(scnt2,1,MPI_INT,rcnt2,1,MPI_INT,MPI_COMM[l+1][2]);// Comm send count per block
+      sdsp2[0] = 0; sdsp2[1] = scnt2[0];                        // Set send displacement
+      rdsp2[0] = 0; rdsp2[1] = rcnt2[0];                        // Set recv displacement
+      int rsize = (rdsp2[1] + rcnt2[1]) / bytes;                // Size of recieved bodies
+      sendBodies.resize(rsize);                                 // Resize send bodies
+      sendBuffer.resize(rsize);                                 // Resize send buffer
+      recvBuffer.resize(rsize);                                 // Resize recv buffer
+      MPI_Alltoallv(&sendBuffer[0],scnt2,sdsp2,MPI_BYTE,        // Communicate cells
+                    &recvBuffer[0],rcnt2,rdsp2,MPI_BYTE,MPI_COMM[l+1][2]);// MPI_COMM[2] is for the one-to-one pair
+      rdspd[0] = 0;                                             // Initialize permuted recv displacement
+      for( int irank=0; irank!=MPISIZE-1; ++irank ) {           // Loop over ranks
+        rdspd[irank+1] = rdspd[irank] + rcntd[irank];           //  Set permuted recv displacement
+      }                                                         // End loop over ranks
+      ic = 0;                                                   // Initiaize counter
+      for( int i=0; i!=2; ++i ) {                               // Loop over the two blocks
+        for( int irank=0; irank!=MPISIZE/2; ++irank ) {         //  Loop over ranks in each block
+          int idata = (irank / npart) * 2 * npart + irank % npart + i * npart;// Original index
+          int irecv = i * MPISIZE / 2 + irank;                  //   Compressed index
+          recvBodyCnt[idata] = rcntd[irecv];                    //   Set recv cound
+          idata = irev[irecv];                                  //   Set premuted index
+          for( int id=rdspd[idata]; id!=rdspd[idata]+rcntd[idata]; ++id,++ic ) {// Loop over bodies
+            sendBodies[ic] = recvBuffer[id];                    //    Get data from recv buffer
+          }                                                     //   End loop over bodies
+        }                                                       //  End loop over ranks in each block
+      }                                                         // End loop over blocks
+      recvBodyDsp[0] = 0;                                       // Initialize recv displacement
+      for( int irank=0; irank!=MPISIZE-1; ++irank ) {           // Loop over ranks
+        recvBodyDsp[irank+1] = recvBodyDsp[irank] + recvBodyCnt[irank];//  Set recv displacement
+      }                                                         // End loop over ranks
+      for( int irank=0; irank!=MPISIZE; ++irank ) {             // Loop over ranks
+        sendBodyCnt[irank] = recvBodyCnt[irank];                //  Get next send count
+        sendBodyDsp[irank] = recvBodyDsp[irank];                //  Get next send displacement
+      }                                                         // End loop over ranks
+    }                                                           // End loop over levels of N-D hypercube communication
+    recvBodies = sendBodies;                                    // Copy send bodies to recv bodies
+    delete[] scntd;                                             // Delete permuted send count
+    delete[] rcntd;                                             // Delete permuted recv count
+    delete[] rdspd;                                             // Delete permuted recv displacement
+    delete[] irev;                                              // Delete map from original to compressed index
+  }
+
+//! Get boundries of domains on other processes
+  void getOtherDomain(vect &xmin, vect &xmax, int l) {
+    startTimer("Get domain");                                   // Start timer
+    MPI_Datatype MPI_TYPE = getType(XMIN[l][0]);                // Get MPI data type
+    vect send[2],recv[2];                                       // Send and recv buffer
+    MPI_Request req;                                            // MPI requests
+    send[0] = send[1] = XMIN[l];                                // Set XMIN into send buffer
+    recv[0] = recv[1] = 0;                                      // Initialize recv buffer
+    MPI_Alltoall(send,3,MPI_TYPE,recv,3,MPI_TYPE,MPI_COMM[l][2]);// Communicate XMIN
+    xmin = recv[1-key[l][2]];                                   // Copy xmin from recv buffer
+    send[0] = send[1] = XMAX[l];                                // Set XMAX into send buffer
+    recv[0] = recv[1] = 0;                                      // Initialize recv buffer
+    MPI_Alltoall(send,3,MPI_TYPE,recv,3,MPI_TYPE,MPI_COMM[l][2]);// Communicate XMAX
+    xmax = recv[1-key[l][2]];                                   // Copy xmax from recv buffer
+    if( nprocs[l-1][0] % 2 == 1 && nprocs[l][0] >= nprocs[l][1] ) {// If right half of odd group
+      int isend = (key[l][0] + 1            ) % nprocs[l][0];   //  Send to next rank (wrapped)
+      int irecv = (key[l][0] - 1 + nprocs[l][0]) % nprocs[l][0];//  Recv from previous rank (wrapped)
+      send[0] = xmin;                                           //  Set xmin in send buffer
+      MPI_Isend(send,3,MPI_TYPE,isend,0,MPI_COMM[l][0],&req);   //  Send to next rank
+      MPI_Irecv(recv,3,MPI_TYPE,irecv,0,MPI_COMM[l][0],&req);   //  Recv from previous rank
+      MPI_Wait(&req,MPI_STATUS_IGNORE);                         //  Wait for recv to finish
+      if( color[l][0] != color[l][1] ) xmin = recv[0];          //  Update only if leftover process of odd group
+      send[0] = xmax;                                           //  Set xmax in send buffer
+      MPI_Isend(send,3,MPI_TYPE,isend,0,MPI_COMM[l][0],&req);   //  Send to next rank
+      MPI_Irecv(recv,3,MPI_TYPE,irecv,0,MPI_COMM[l][0],&req);   //  Recv from previous rank
+      MPI_Wait(&req,MPI_STATUS_IGNORE);                         //  Wait for recv to finish
+      if( color[l][0] != color[l][1] ) xmax = recv[0];          //  Update only if leftover process of odd group
+    }                                                           // Endif for right half of odd group
+    if( nprocs[l-1][0] == 1 ) {                                 // If previous group had one process
+      xmin = XMIN[l];                                           //  Use own XMIN value for xmin
+      xmax = XMAX[l];                                           //  Use own XMAX value for xmax
+    }                                                           // Endif for isolated process
+    stopTimer("Get domain",printNow);                           // Stop timer
+  }
+
+//! Get disatnce to other domain
+  real getDistance(C_iter C, vect xmin, vect xmax) {
+    vect dist;                                                  // Distance vector
+    for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
+      dist[d] = (C->X[d] + Xperiodic[d] > xmax[d])*             //  Calculate the distance between cell C and
+                (C->X[d] + Xperiodic[d] - xmax[d])+             //  the nearest point in domain [xmin,xmax]^3
+                (C->X[d] + Xperiodic[d] < xmin[d])*             //  Take the differnece from xmin or xmax
+                (C->X[d] + Xperiodic[d] - xmin[d]);             //  or 0 if between xmin and xmax
+    }                                                           // End loop over dimensions
+    real R = std::sqrt(norm(dist));                             // Scalar distance
+    return R;
+  }
+
+//! Determine which cells to send
+  void getLET(C_iter C0, C_iter C, vect xmin, vect xmax) {
+    int level = int(log(MPISIZE-1) / M_LN2 / 3) + 1;            // Level of local root cell
+    if( MPISIZE == 1 ) level = 0;                               // Account for serial case
+    for( int i=0; i!=C->NCHILD; i++ ) {                         // Loop over child cells
+      C_iter CC = C0+C->CHILD+i;                                //  Iterator for child cell
+      bool divide = false;                                      //  Initialize logical for dividing
+      if( IMAGES == 0 ) {                                       //  If free boundary condition
+        Xperiodic = 0;                                          //   Set periodic coordinate offset
+        real R = getDistance(CC,xmin,xmax);                     //   Get distance to other domain
+        divide |= CLET * CC->R > THETA * R - EPS2;              //   If the cell seems too close and not twig
+      } else {                                                  //  If periodic boundary condition
+        for( int ix=-1; ix<=1; ++ix ) {                         //   Loop over x periodic direction
+          for( int iy=-1; iy<=1; ++iy ) {                       //    Loop over y periodic direction
+            for( int iz=-1; iz<=1; ++iz ) {                     //     Loop over z periodic direction
+              Xperiodic[0] = ix * 2 * R0;                       //      Coordinate offset for x periodic direction
+              Xperiodic[1] = iy * 2 * R0;                       //      Coordinate offset for y periodic direction
+              Xperiodic[2] = iz * 2 * R0;                       //      Coordinate offset for z periodic direction
+              real R = getDistance(CC,xmin,xmax);               //      Get distance to other domain
+              divide |= CLET * CC->R > THETA * R - EPS2;        //      If the cell seems too close and not twig
+            }                                                   //     End loop over z periodic direction
+          }                                                     //    End loop over y periodic direction
+        }                                                       //   End loop over x periodic direction
+      }                                                         //  Endif for periodic boundary condition
+      divide |= R0 / (1 << level) + 1e-5 < CC->R;               //  If the cell is larger than the local root cell
+      if( divide && CC->NCHILD != 0 ) {                         //  If the cell seems too close and not twig
+        getLET(C0,CC,xmin,xmax);                                //   Traverse the tree further
+      } else {                                                  //  If the cell is far or a twig
+        assert( R0 / (1 << level) + 1e-5 > CC->R );             //   Can't send cells that are larger than local root
+        JCell cell;                                             //   Set compact cell type for sending
+        cell.ICELL = CC->ICELL;                                 //   Set index of compact cell type
+        cell.M     = CC->M;                                     //   Set Multipoles of compact cell type
+        sendCells.push_back(cell);                              //    Push cell into send buffer vector
+      }                                                         //  Endif for interaction
+    }                                                           // End loop over child cells
+    if( C->ICELL == 0 && C->NCHILD == 0 ) {                     // If the root cell has no children
+      JCell cell;                                               //  Set compact cell type for sending
+      cell.ICELL = C->ICELL;                                    //  Set index of compact cell type
+      cell.M     = C->M;                                        //  Set Multipoles of compact cell type
+      sendCells.push_back(cell);                                //  Push cell into send buffer vector
+    }                                                           // Endif for root cells children
+  }
+
+//! Communicate cells by one-to-one MPI_Alltoallv
+  void commCellsAlltoall(int l) {
+    const int bytes = sizeof(sendCells[0]);                     // Byte size of JCell structure
+    int rcnt[2], scnt[2] = {0, 0};                              // Recv count, send count
+    scnt[1-key[l+1][2]] = sendCells.size()*bytes;               // Set send count to size of send buffer * bytes
+    MPI_Alltoall(scnt,1,MPI_INT,rcnt,1,MPI_INT,MPI_COMM[l+1][2]);// Communicate the send count to get recv count
+    int sdsp[2] = {0, scnt[0]};                                 // Send displacement
+    int rdsp[2] = {0, rcnt[0]};                                 // Recv displacement
+    if( color[l+1][0] != color[l+1][1] ) {                      // If leftover process of odd group
+      rcnt[1-key[l+1][2]] = 0;                                  //  It won't have a pair for this communication
+    }                                                           // Endif for leftover process of odd group
+    recvCells.resize(rcnt[1-key[l+1][2]]/bytes);                // Resize recv buffer
+    MPI_Alltoallv(&sendCells[0],scnt,sdsp,MPI_BYTE,             // Communicate cells
+                  &recvCells[0],rcnt,rdsp,MPI_BYTE,MPI_COMM[l+1][2]);// MPI_COMM[2] is for the one-to-one pair
+  }
+
+//! Communicate cells by scattering from leftover processes
+  void commCellsScatter(int l) {
+    const int bytes = sizeof(sendCells[0]);                     // Byte size of JCell structure
+    int numScatter = nprocs[l+1][1] - 1;                        // Number of processes to scatter to
+    int oldSize = recvCells.size();                             // Size of recv buffer before communication
+    int *scnt = new int [nprocs[l+1][1]];                       // Send count
+    int *sdsp = new int [nprocs[l+1][1]];                       // Send displacement
+    int rcnt;                                                   // Recv count
+    if( key[l+1][1] == numScatter ) {                           // If this is the leftover proc to scatter from
+      sdsp[0] = 0;                                              //  Initialize send displacement
+      for(int i=0; i!=numScatter; ++i ) {                       //  Loop over processes to send to
+        int begin = 0, end = sendCells.size();                  //   Set begin and end of range to send
+        splitRange(begin,end,i,numScatter);                     //   Split range into numScatter and get i-th range
+        scnt[i] = end - begin;                                  //   Set send count based on range
+        sdsp[i+1] = sdsp[i] + scnt[i];                          //   Set send displacement based on send count
+      }                                                         //  End loop over processes to send to
+      scnt[numScatter] = 0;                                     //  Send count to self should be 0
+    }                                                           // Endif for leftover proc
+    MPI_Scatter(scnt,1,MPI_INT,&rcnt,1,MPI_INT,numScatter,MPI_COMM[l+1][1]);// Scatter the send count to get recv count
+
+    recvCells.resize(oldSize+rcnt);                             // Resize recv buffer based on recv count
+    for(int i=0; i!= nprocs[l+1][1]; ++i ) {                    // Loop over group of processes
+      scnt[i] *= bytes;                                         //  Multiply send count by byte size of data
+      sdsp[i] *= bytes;                                         //  Multiply send displacement by byte size of data
+    }                                                           // End loop over group of processes
+    rcnt *= bytes;                                              // Multiply recv count by byte size of data
+    MPI_Scatterv(&sendCells[0],      scnt,sdsp,MPI_BYTE,        // Communicate cells via MPI_Scatterv
+                 &recvCells[oldSize],rcnt,     MPI_BYTE,        // Offset recv buffer by oldSize
+                 numScatter,MPI_COMM[l+1][1]);
+    delete[] scnt;                                              // Delete send count
+    delete[] sdsp;                                              // Delete send displacement
+  }
+
+//! Turn recv bodies to twigs
+  void rbodies2twigs(Bodies &bodies, Cells &twigs) {
+    startTimer("Recv bodies");                                  //  Start timer
+    for( JB_iter JB=recvBodies.begin(); JB!=recvBodies.end(); ++JB ) {// Loop over recv bodies
+      Body body;                                                //  Body structure
+      body.IBODY = 0;                                           //  Initialize body index
+      body.IPROC = 0;                                           //  Initialize proc index
+      body.TRG   = 0;                                           //  Initialize target values
+      body.ICELL = JB->ICELL;                                   //  Set index of cell
+      body.X     = JB->X;                                       //  Set position of body
+      body.SRC   = JB->SRC;                                     //  Set source values of body
+      bodies.push_back(body);                                   //  Push body into bodies vector
+    }                                                           // End loop over recv bodies
+    buffer.resize(bodies.size());                               // Resize sort buffer
+    stopTimer("Recv bodies",printNow);                          //  Stop timer
+    sortBodies(bodies,buffer,false);                            // Sort bodies in descending order
+    bodies2twigs(bodies,twigs);                                 // Turn bodies to twigs
+  }
+
+//! Turn cells to twigs
+  void cells2twigs(Cells &cells, Cells &twigs, bool last) {
+    while( !cells.empty() ) {                                   // While cell vector is not empty
+      if( cells.back().NCHILD == 0 ) {                          //  If cell has no child
+        if( cells.back().NDLEAF == 0 || !last ) {               //   If cell has no leaf or is not last iteration
+          cells.back().NDLEAF = 0;                              //    Set number of leafs to 0
+          twigs.push_back(cells.back());                        //    Push cell into twig vector
+        }                                                       //   Endif for no leaf
+      }                                                         //  Endif for no child
+      cells.pop_back();                                         //  Pop last element from cell vector
+    }                                                           // End while for cell vector
+  }
+
+//! Turn send buffer to twigs
+  void send2twigs(Bodies &bodies, Cells &twigs, int offTwigs) {
+    for( JC_iter JC=sendCells.begin(); JC!=sendCells.begin()+offTwigs; ++JC ) {// Loop over send buffer
+      Cell cell;                                                //  Cell structure
+      cell.ICELL = JC->ICELL;                                   //  Set index of cell
+      cell.M     = JC->M;                                       //  Set multipole of cell
+      cell.CHILD = cell.NCLEAF = cell.NDLEAF = cell.NCHILD = 0; //  Set number of leafs and children
+      cell.LEAF  = bodies.end();                                //  Set pointer to first leaf
+      getCenter(cell);                                          //  Set center and radius
+      twigs.push_back(cell);                                    //  Push cell into twig vector
+    }                                                           // End loop over send buffer
+    sendCells.clear();                                          // Clear send buffer
+  }
+
+//! Turn recv buffer to twigs
+  void recv2twigs(Bodies &bodies, Cells &twigs) {
+    for( JC_iter JC=recvCells.begin(); JC!=recvCells.end(); ++JC ) {// Loop over recv buffer
+      Cell cell;                                                //  Cell structure
+      cell.ICELL = JC->ICELL;                                   //  Set index of cell
+      cell.M     = JC->M;                                       //  Set multipole of cell
+      cell.CHILD = cell.NCLEAF = cell.NDLEAF = cell.NCHILD = 0; //  Set number of leafs and children
+      cell.LEAF  = bodies.end();                                //  Set pointer to first leaf
+      getCenter(cell);                                          //  Set center and radius
+      twigs.push_back(cell);                                    //  Push cell into twig vector
+    }                                                           // End loop over recv buffer
+  }
+
+//! Zip two groups of twigs that overlap
+  void zipTwigs(Cells &twigs, Cells &cells, Cells &sticks, bool last) {
+    startTimer("Sort resize");                                  // Start timer
+    Cells cbuffer = twigs;                                      // Sort buffer for cells
+    stopTimer("Sort resize",printNow);                          // Stop timer
+    sortCells(twigs,cbuffer);                                   // Sort twigs in ascending order
+    startTimer("Ziptwigs");                                     // Start timer
+    bigint index = 1e8;                                         // Initialize index counter
+    while( !twigs.empty() ) {                                   // While twig vector is not empty
+      if( twigs.back().ICELL != index ) {                       //  If twig's index is different from previous
+        cells.push_back(twigs.back());                          //   Push twig into cell vector
+        index = twigs.back().ICELL;                             //   Update index counter
+      } else if ( twigs.back().NDLEAF == 0 || !last ) {         //  Elseif twig-twig collision
+        cells.back().M += twigs.back().M;                       //   Accumulate the multipole
+      } else if ( cells.back().NDLEAF == 0 ) {                  //  Elseif twig-body collision
+        Mset M;                                                 //   Multipole for temporary storage
+        M = cells.back().M;                                     //   Save multipoles from cells
+        cells.back() = twigs.back();                            //   Copy twigs to cells
+        cells.back().M = M;                                     //   Copy back multipoles to cells
+        twigs.back().M = M - twigs.back().M;                    //   Take the difference of the two
+        if( std::abs(twigs.back().M[0]/M[0]) > EPS ) {          //   If the difference is non-zero
+          sticks.push_back(twigs.back());                       //    Save this difference in the sticks vector
+        }                                                       //   Endif for non-zero difference
+      } else {                                                  //  Else body-body collision (don't do anything)
+      }                                                         //  Endif for collision type
+      twigs.pop_back();                                         //  Pop last element from twig vector
+    }                                                           // End while for twig vector
+    stopTimer("Ziptwigs",printNow);                             // Stop timer
+    sortCells(cells,cbuffer);                                   // Sort cells in ascending order
+    startTimer("Ziptwigs");                                     // Start timer
+    twigs = cells;                                              // Copy cells to twigs
+    cells.clear();                                              // Clear cells
+    stopTimer("Ziptwigs",printNow);                             // Stop timer
+  }
+
+//! Re-index bodies
+  void reindexBodies(Bodies &bodies, Cells &twigs, Cells &cells ,Cells &sticks) {
+    startTimer("Reindex");                                      // Start timer
+    while( !twigs.empty() ) {                                   // While twig vector is not empty
+      if( twigs.back().NDLEAF == 0 ) {                          //  If twig has no leafs
+        cells.push_back(twigs.back());                          //   Push twig into cell vector
+      }                                                         //  Endif for no leafs
+      twigs.pop_back();                                         //  Pop last element from twig vector
+    }                                                           // End while for twig vector
+//    BottomUp::setIndex(bodies,-1,0,0,true);                     // Set index of bodies
+    buffer.resize(bodies.size());                               // Resize sort buffer
+    stopTimer("Reindex",printNow);                              // Stop timer
+//    sortBodies(bodies,buffer,false);                            // Sort bodies in descending order
+//    BottomUp::grow(bodies);                                     // Grow tree structure
+    sortBodies(bodies,buffer,false);                            // Sort bodies in descending order
+    bodies2twigs(bodies,twigs);                                 // Turn bodies to twigs
+    startTimer("Reindex");                                      // Start timer
+    for( C_iter C=twigs.begin(); C!=twigs.end(); ++C ) {        // Loop over cells
+      if( sticks.size() > 0 ) {                                 //  If stick vector is not empty
+        if( C->ICELL == sticks.back().ICELL ) {                 //   If twig's index is equal to stick's index
+          C->M += sticks.back().M;                              //    Accumulate multipole
+          sticks.pop_back();                                    //    Pop last element from stick vector
+        }                                                       //   Endif for twig's index
+      }                                                         //  Endif for stick vector
+    }                                                           // End loop over cells
+    cells.insert(cells.begin(),twigs.begin(),twigs.end());      // Add twigs to the end of cell vector
+    cells.insert(cells.begin(),sticks.begin(),sticks.end());    // Add remaining sticks to the end of cell vector
+    sticks.clear();                                             // Clear sticks
+    Cells cbuffer = cells;                                      // Sort buffer for cells
+    stopTimer("Reindex",printNow);                              // Stop timer
+    sortCells(cells,cbuffer);                                   // Sort cells in ascending order
+    startTimer("Reindex");                                      // Start timer
+    twigs = cells;                                              // Copy cells to twigs
+    cells.clear();                                              // Clear cells
+    stopTimer("Reindex",printNow);                              // Stop timer
+  }
+
+//! Turn sticks to send buffer
+  void sticks2send(Cells &sticks, int &offTwigs) {
+    while( !sticks.empty() ) {                                  // While stick vector is not empty
+      JCell cell;                                               //  Cell structure
+      cell.ICELL = sticks.back().ICELL;                         //  Set index of cell
+      cell.M     = sticks.back().M;                             //  Set multipole of cell
+      sendCells.push_back(cell);                                //  Push cell into send buffer
+      sticks.pop_back();                                        //  Pop last element of stick vector
+    }                                                           // End while for stick vector
+    offTwigs = sendCells.size();                                // Keep track of current send buffer size
+  }
+
+//! Validate number of send cells
+  void checkNumCells(int l) {                                   // Only works with octsection
+    int maxLevel = int(log(MPISIZE-1) / M_LN2 / 3) + 1;
+    if( MPISIZE == 1 ) maxLevel = 0;
+    int octant0 = -1;
+    int numCells = 0;
+    for( JC_iter JC=sendCells.begin(); JC!=sendCells.end(); ++JC ) {
+      int level = getLevel(JC->ICELL);
+      int index = JC->ICELL - ((1 << 3*level) - 1) / 7;
+      int octant = index / (1 << 3 * (level - maxLevel));
+      if( octant != octant0 ) {
+        octant0 = octant;
+        numCells++;
+      }
+    }
+    int numCellsExpect = (1 << (3 * maxLevel - 1)) / (1 << l);  // Isn't true for far domains
+    if( numCellsExpect != numCells && MPIRANK == 0) std::cout << numCells << " " << numCellsExpect << std::endl;
+  }
+
+//! Check total charge
+  void checkSumMass(Cells &cells) {
+    real localMass = 0;
+    for( C_iter C=cells.begin(); C!=cells.end(); ++C ) {
+      if( C->NCHILD == 0 ) {
+        localMass += std::abs(C->M[0]);
+      }
+    }
+    real globalMass;
+    MPI_Allreduce(&localMass,&globalMass,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+    print("localMass : ",0);
+    print(localMass);
+    print("globalMass : ",0);
+    print(globalMass,0);
+    print("\n",0);
+  }
+
+public:
+//! Set bodies to communicate
+  void setCommBodies(Cells &cells) {
+    startTimer("Gather bounds");                                // Start timer
+    gatherBounds();                                             // Gather bounds of other domain
+    stopTimer("Gather bounds",printNow);                        // Stop timer
+    startTimer("Get send rank");                                // Start timer
+    getSendRank(cells);                                         // Get neighbor ranks to send to
+    stopTimer("Get send rank",printNow);                        // Stop timer
+  }
+
+//! Update bodies using the previous send count
+  void updateBodies(bool comm=true) {
+    startTimer("Get send cnt");                                 // Start timer
+    getSendCount(comm);                                         // Get size of data to send
+    stopTimer("Get send cnt",printNow);                         // Stop timer
+    startTimer("Alltoall B");                                   // Start timer
+#if 1
+    int bytes = sizeof(sendBodies[0]);                          // Byte size of jbody structure
+    for( int i=0; i!=MPISIZE; ++i ) {                           // Loop over ranks
+      sendBodyCnt[i] *= bytes;                                  //  Multiply by bytes
+      sendBodyDsp[i] *= bytes;                                  //  Multiply by bytes
+      recvBodyCnt[i] *= bytes;                                  //  Multiply by bytes
+      recvBodyDsp[i] *= bytes;                                  //  Multiply by bytes
+      if(printNow&&i!=0) std::cout << "Rank " << MPIRANK << " sends to rank " << i << " : " << sendBodyCnt[i] << " Bytes for P2P" << std::endl;
+    }                                                           // End loop over ranks
+    MPI_Alltoallv(&sendBodies[0],&sendBodyCnt[0],&sendBodyDsp[0],MPI_BYTE,
+                  &recvBodies[0],&recvBodyCnt[0],&recvBodyDsp[0],MPI_BYTE,MPI_COMM_WORLD);
+    for( int i=0; i!=MPISIZE; ++i ) {                           // Loop over ranks
+      sendBodyCnt[i] /= bytes;                                  //  Divide by bytes
+      sendBodyDsp[i] /= bytes;                                  //  Divide by bytes
+      recvBodyCnt[i] /= bytes;                                  //  Divide by bytes
+      recvBodyDsp[i] /= bytes;                                  //  Divide by bytes
+    }                                                           // End loop over ranks
+#else
+    commBodiesAlltoall();
+#endif
+    sendBodies.clear();                                         // Clear send buffer for bodies
+    stopTimer("Alltoall B",printNow);                           // Stop timer
+  }
+
+//! Communicate bodies in the local essential tree
+  void commBodies(Cells &cells) {
+    setCommBodies(cells);                                       // Set bodies to communicate
+    updateBodies();                                             // Update bodies with alltoall
+  }
+
+//! Convert recvBodies to cells
+  void bodies2cells(Bodies &bodies, Cells &cells) {
+    Cells twigs,sticks;                                         // Twigs and sticks are special types of cells
+    rbodies2twigs(bodies,twigs);                                // Put recv bodies into twig vector
+    twigs2cells(twigs,cells,sticks);                            // Turn twigs to cells
+  }
+
+//! Communicate cells in the local essential tree
+  void commCells(Bodies &bodies, Cells &cells) {
+    vect xmin = 0, xmax = 0;                                    // Initialize domain boundaries
+    Cells twigs,sticks;                                         // Twigs and sticks are special types of cells
+
+#if 1
+    startTimer("Get LET");                                      // Start timer
+    int ssize = 0;                                              // Initialize offset for send cells
+    sendCellCnt.assign(MPISIZE,0);                              // Initialize cell send count
+    sendCellDsp.assign(MPISIZE,0);                              // Initialize cell send displacement
+    for( int irank=0; irank!=MPISIZE; ++irank ) {               // Loop over ranks to send to
+      getLET(cells.begin(),cells.end()-1,xminAll[irank],xmaxAll[irank]);//  Determine which cells to send
+      sendCellCnt[irank] = sendCells.size()-ssize;              //  Set cell send count of current rank
+      sendCellDsp[irank] = ssize;                               //  Set cell send displacement of current rank
+      ssize += sendCellCnt[irank];                              //  Increment offset for vector send cells
+      if( printNow ) {
+        int sendSize[10] = {0,0,0,0,0,0,0,0,0,0};
+        for( JC_iter JC=sendCells.begin(); JC!=sendCells.end(); ++JC ) {
+          int level = getLevel(JC->ICELL);
+          if(sendCellDsp[irank] <= JC-sendCells.begin() && JC-sendCells.begin() < ssize )
+            sendSize[level] += sizeof(sendCells[0]);
+        }
+        for( int l=0; l<10; l++ ) {
+          if(irank!=0) std::cout << "Rank " << MPIRANK << " sends to rank " << irank << " : " << sendSize[l] << " Bytes @ level " << l << std::endl;
+        }
+      }
+    }                                                           // End loop over ranks
+    stopTimer("Get LET",printNow);                              // Stop timer
+    startTimer("Alltoall C");                                   // Start timer
+    MPI_Alltoall(&sendCellCnt[0],1,MPI_INT,&recvCellCnt[0],1,MPI_INT,MPI_COMM_WORLD);// Communicate the send counts
+    int rsize = 0;                                              // Initialize total recv count
+    for( int irank=0; irank!=MPISIZE; ++irank ) {               // Loop over ranks to recv from
+      recvCellDsp[irank] = rsize;                               //  Set recv displacements
+      rsize += recvCellCnt[irank];                              //  Accumulate recv counts
+    }                                                           // End loop over ranks to recv from
+    recvCells.resize(rsize);                                    // Resize recv buffer
+    int bytes = sizeof(sendCells[0]);                           // Byte size of jbody structure
+    for( int i=0; i!=MPISIZE; ++i ) {                           // Loop over ranks
+      sendCellCnt[i] *= bytes;                                  //  Multiply by bytes
+      sendCellDsp[i] *= bytes;                                  //  Multiply by bytes
+      recvCellCnt[i] *= bytes;                                  //  Multiply by bytes
+      recvCellDsp[i] *= bytes;                                  //  Multiply by bytes
+      if(printNow&&i!=0) std::cout << "Rank " << MPIRANK << " sends to rank " << i << " : " << sendCellCnt[i] << " Bytes for M2L total" << std::endl;
+    }                                                           // End loop over ranks
+    MPI_Alltoallv(&sendCells[0],&sendCellCnt[0],&sendCellDsp[0],MPI_BYTE,
+                  &recvCells[0],&recvCellCnt[0],&recvCellDsp[0],MPI_BYTE,MPI_COMM_WORLD);
+    for( int i=0; i!=MPISIZE; ++i ) {                           // Loop over ranks
+      sendCellCnt[i] /= bytes;                                  //  Divide by bytes
+      sendCellDsp[i] /= bytes;                                  //  Divide by bytes
+      recvCellCnt[i] /= bytes;                                  //  Divide by bytes
+      recvCellDsp[i] /= bytes;                                  //  Divide by bytes
+    }                                                           // End loop over ranks
+    stopTimer("Alltoall C",printNow);                           // Stop timer
+    rbodies2twigs(bodies,twigs);                                // Put recv bodies into twig vector
+    startTimer("Cells2twigs");                                  // Start timer
+    cells2twigs(cells,twigs,true);                              // Put cells into twig vector
+    stopTimer("Cells2twigs",printNow);                          // Stop timer
+    startTimer("Recv2twigs");                                   // Start timer
+    recv2twigs(bodies,twigs);                                   // Put recv buffer into twig vector
+    stopTimer("Recv2twigs",printNow);                           // Stop timer
+    zipTwigs(twigs,cells,sticks,true);                          // Zip two groups of twigs that overlap
+    reindexBodies(bodies,twigs,cells,sticks);                   // Re-index bodies
+    twigs2cells(twigs,cells,sticks);                            // Turn twigs to cells
+    sendCells.clear();                                          // Clear send buffer
+    recvCells.clear();                                          // Clear recv buffer
+#else
+    int offTwigs = 0;                                           // Initialize offset of twigs
+    for( int l=0; l!=LEVEL; ++l ) {                             // Loop over levels of N-D hypercube communication
+      getOtherDomain(xmin,xmax,l+1);                            //  Get boundries of domains on other processes
+      startTimer("Get LET");                                    //  Start timer
+      getLET(cells.begin(),cells.end()-1,xmin,xmax);            //  Determine which cells to send
+#ifdef DEBUG
+      checkNumCells(LEVEL-l-1);
+      checkSumMass(cells);
+#endif
+      stopTimer("Get LET",printNow);                            //  Stop timer
+      startTimer("Alltoall C");                                 //  Start timer
+      commCellsAlltoall(l);                                     //  Communicate cells by one-to-one MPI_Alltoallv
+      if( nprocs[l][0] % 2 == 1 && nprocs[l][0] != 1 && nprocs[l+1][0] <= nprocs[l+1][1] ) {// If scatter is necessary
+        commCellsScatter(l);                                    //   Communicate cells by scattering from leftover proc
+      }                                                         //  Endif for odd number of procs
+      stopTimer("Alltoall C",printNow);                         //  Stop timer