Commits

Rio Yokota committed 5779782

Reverting today's changes.

  • Participants
  • Parent commits 6536d62

Comments (0)

Files changed (24)

File Makefile.include

 #DEVICE  = GPU
 
 ### choose Cartesian or spherical expansion
-EXPAND  = Cartesian
-#EXPAND  = Spherical
+#EXPAND  = Cartesian
+EXPAND  = Spherical
 
 ### GCC compiler
 CXX	= mpicxx -ggdb3 -Wall -Wextra -O3 -ffast-math -funroll-loops -fforce-addr -fPIC -I../include
 #CXX     += -I$(VTK_INCLUDE_PATH)
 #VFLAGS  = -L$(VTK_LIBRARY_PATH) -lvtkRendering -lvtkGraphics -lvtkFiltering -lvtkViews -lvtkCommon -lvtkWidgets -lvtkIO -DVTK
 
+ifeq ($(DEVICE),GPU)
 ### CUDA flags
-ifeq ($(DEVICE),GPU)
 LFLAGS  += -DQUEUE -L$(CUDA_INSTALL_PATH)/lib64 -lcuda -lcudart -lstdc++ -ldl -lm
 endif
 

File include/bottomup.h

 */
 #ifndef bottomup_h
 #define bottomup_h
-#if COMPARE
 #include "topdown.h"
-#else
-#include "topdown2.h"
-#endif
 
 //! Bottomup tree constructor
 template<Equation equation>

File include/evaluator.h

 //! Approximate interaction between two cells
   inline void approximate(C_iter Ci, C_iter Cj) {
 #if HYBRID
-    if( timeP2P*Cj->NDBODY < timeM2P && timeP2P*Ci->NDBODY*Cj->NDBODY < timeM2L) {// If P2P is fastest
+    if( timeP2P*Cj->NDLEAF < timeM2P && timeP2P*Ci->NDLEAF*Cj->NDLEAF < timeM2L) {// If P2P is fastest
       evalP2P(Ci,Cj);                                           //  Evaluate on CPU, queue on GPU
-    } else if ( timeM2P < timeP2P*Cj->NDBODY && timeM2P*Ci->NDBODY < timeM2L ) {// If M2P is fastest
+    } else if ( timeM2P < timeP2P*Cj->NDLEAF && timeM2P*Ci->NDLEAF < timeM2L ) {// If M2P is fastest
       evalM2P(Ci,Cj);                                           //  Evaluate on CPU, queue on GPU
     } else {                                                    // If M2L is fastest
       evalM2L(Ci,Cj);                                           //  Evaluate on CPU, queue on GPU
       C = cellStack.top();                                      //  Get cell from top of stack
       cellStack.pop();                                          //  Pop traversal stack
       for( C_iter Cj=Cj0+C->CHILD; Cj!=Cj0+C->CHILD+C->NCHILD; ++Cj ) {// Loop over cell's children
-        vec3 dX = Ci->X - Cj->X - Xperiodic;                    //   Distance vector from source to target
+        vect dX = Ci->X - Cj->X - Xperiodic;                    //   Distance vector from source to target
         real Rq = std::sqrt(norm(dX));                          //   Scalar distance
         if( Rq * THETA < Ci->R + Cj->R && Cj->NCHILD == 0 ) {   //   If twigs are close
           evalEwaldReal(Ci,Cj);                                 //    Ewald real part
         }                                                       //   End loop over second cell's children
       }                                                         //  End if for which cell to split
 #if QUARK
-      if( int(pairQueue.size()) > root->NDBODY / 100 ) {        //  When queue size reaches threshold
+      if( int(pairQueue.size()) > root->NDLEAF / 100 ) {        //  When queue size reaches threshold
         while( !pairQueue.empty() ) {                           //   While dual traversal queue is not empty
           pair = pairQueue.front();                             //    Get interaction pair from front of queue
           pairQueue.pop_front();                                //    Pop dual traversal queue
 #endif
   }
 
-//! Split cell and call traverse() recursively for child
-  void splitCell(C_iter Ci, C_iter Cj) {
-    if (Cj->NCHILD == 0) {                                      // If Cj is leaf
-      assert(Ci->NCHILD > 0);                                   //  Make sure Ci is not leaf
-      for (C_iter ci=Ci0+Ci->CHILD; ci!=Ci0+Ci->CHILD+Ci->NCHILD; ci++ ) {// Loop over Ci's children
-        traverse(ci, Cj);                                       //   Traverse a single pair of cells
-      }                                                         //  End loop over Ci's children
-    } else if (Ci->NCHILD == 0) {                               // Else if Ci is leaf
-      assert(Cj->NCHILD > 0);                                   //  Make sure Cj is not leaf
-      for (C_iter cj=Cj0+Cj->CHILD; cj!=Cj0+Cj->CHILD+Cj->NCHILD; cj++ ) {// Loop over Cj's children
-        traverse(Ci, cj);                                       //   Traverse a single pair of cells
-      }                                                         //  End loop over Cj's children
-    } else if (Ci->RCRIT >= Cj->RCRIT) {                        // Else if Ci is larger than Cj
-      for (C_iter ci=Ci0+Ci->CHILD; ci!=Ci0+Ci->CHILD+Ci->NCHILD; ci++ ) {// Loop over Ci's children
-        traverse(ci, Cj);                                       //   Traverse a single pair of cells
-      }                                                         //  End loop over Ci's children
-    } else {                                                    // Else if Cj is larger than Ci
-      for (C_iter cj=Cj0+Cj->CHILD; cj!=Cj0+Cj->CHILD+Cj->NCHILD; cj++ ) {// Loop over Cj's children
-        traverse(Ci, cj);                                       //   Traverse a single pair of cells
-      }                                                         //  End loop over Cj's children
-    }                                                           // End if for leafs and Ci Cj size
-  }
-
-protected:
-//! Dual tree traversal for a single pair of cells
-  void traverse(C_iter Ci, C_iter Cj) {
-    vec3 dX = Ci->X - Cj->X - Xperiodic;                        // Distance vector from source to target
-    real R2 = norm(dX);                                         // Scalar distance squared
-    if (R2 > (Ci->RCRIT+Cj->RCRIT)*(Ci->RCRIT+Cj->RCRIT)) {   //  If distance is far enough
-      approximate(Ci, Cj);                                      //   Use approximate kernels
-    } else if (Ci->NCHILD == 0 && Cj->NCHILD == 0) {            //  Else if both cells are bodies
-      if (Cj->NCBODY == 0) {                                    //   If the bodies weren't sent from remote node
-        approximate(Ci, Cj);                                    //    Use approximate kernels
-      } else {                                                  //   Else if the bodies were sent
-        if (Ci == Cj) {                                         //    If source and target are same
-          P2P(Ci);                                              //     P2P kernel for single cell
-        } else {                                                //    Else if source and target are different
-          P2P(Ci, Cj);                                          //     P2P kernel for pair of cells
-        }                                                       //    End if for same source and target
-        NP2P++;                                                 //    Increment P2P counter
-      }                                                         //   End if for bodies
-    } else {                                                    //  Else if cells are close but not bodies
-      splitCell(Ci, Cj);                                        //   Split cell and call function recursively for child
-    }                                                           //  End if for multipole acceptance
-  }
-
 //! Get range of periodic images
   int getPeriodicRange() {
     int prange = 0;                                             //  Range of periodic images
     return prange;                                              // Return range of periodic images
   }
 
+protected:
 //! Get level from cell index
   int getLevel(bigint index) {
     int i = index;                                              // Copy to dummy index
                     cell.X[1]  = C->X[1] + (iy * 6 + cy * 2) * C->R;//     Set new y cooridnate for periodic image
                     cell.X[2]  = C->X[2] + (iz * 6 + cz * 2) * C->R;//     Set new z coordinate for periodic image
                     cell.M     = C->M;                          //         Copy multipoles to new periodic image
-                    cell.NCBODY = cell.NDBODY = cell.NCHILD = 0;//         Initialize NCBODY, NDBODY, & NCHILD
+                    cell.NCLEAF = cell.NDLEAF = cell.NCHILD = 0;//         Initialize NCLEAF, NDLEAF, & NCHILD
                     jcells.push_back(cell);                     //         Push cell into periodic jcell vector
                   }                                             //        End loop over z periodic direction (child)
                 }                                               //       End loop over y periodic direction (child)
 
 //! Use multipole acceptance criteria to determine whether to approximate, do P2P, or subdivide
   void interact(C_iter Ci, C_iter Cj, PairQueue &pairQueue) {
-    vec3 dX = Ci->X - Cj->X - Xperiodic;                        // Distance vector from source to target
+    vect dX = Ci->X - Cj->X - Xperiodic;                        // Distance vector from source to target
     real Rq = std::sqrt(norm(dX));                              // Scalar distance
     if( Rq * THETA > Ci->R + Cj->R ) {                          // If distance if far enough
       approximate(Ci,Cj);                                       //  Use approximate kernels, e.g. M2L, M2P
 
 //! Dual tree traversal
   void traverse(Cells &cells, Cells &jcells) {
-#if COMPARE
-    C_iter root = cells.begin();                                // Iterator for root target cell
-    C_iter jroot = jcells.begin();                              // Iterator for root source cell
-#else
     C_iter root = cells.end() - 1;                              // Iterator for root target cell
     C_iter jroot = jcells.end() - 1;                            // Iterator for root source cell
-#endif
     if( IMAGES != 0 ) {                                         // If periodic boundary condition
       jroot = jcells.end() - 1 - 26 * 27 * (IMAGES - 1);        //  The root is not at the end
     }                                                           // Endif for periodic boundary condition
     if( IMAGES == 0 ) {                                         // If free boundary condition
       Iperiodic = Icenter;                                      //  Set periodic image flag to center
       Xperiodic = 0;                                            //  Set periodic coordinate offset
-      Pair pair(root,jroot);                                    //     Form pair of root cells
-      traverse(root,jroot);                                     //  Traverse a pair of trees
+      Pair pair(root,jroot);                                    //  Form pair of root cells
+      traverseQueue(pair);                                      //  Traverse a pair of trees
     } else {                                                    // If periodic boundary condition
       int I = 0;                                                //  Initialize index of periodic image
       for( int ix=-1; ix<=1; ++ix ) {                           //  Loop over x periodic direction
             }                                                   //     End loop over z periodic direction
           }                                                     //    End loop over y periodic direction
         }                                                       //   End loop over x periodic direction
-        for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NCBODY; ++B) {  //   Loop over all leafs in cell
+        for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NCLEAF; ++B) {  //   Loop over all leafs in cell
           B->TRG[0] -= M_2_SQRTPI * B->SRC * ALPHA;             //    Self term of Ewald real part
         }                                                       //   End loop over all leafs in cell
       }                                                         //  End if for twig cells

File include/kernel.h

   real *Anm;                                                    //!< \f$ (-1)^n / \sqrt{ \frac{(n + m)!}{(n - m)!} } \f$
   complex *Cnm;                                                 //!< M2L translation matrix \f$ C_{jn}^{km} \f$
 public:
-  vec3 X0;                                                      //!< Center of root cell
+  vect X0;                                                      //!< Center of root cell
   real R0;                                                      //!< Radius of root cell
 
 private:
-  real getBmax(vec3 const&X, C_iter C) const {
+  real getBmax(vect const&X, C_iter C) const {
     real rad = C->R;
     real dx = rad+std::abs(X[0]-C->X[0]);
     real dy = rad+std::abs(X[1]-C->X[1]);
 protected:
   void setCenter(C_iter C) const {
     real m = 0;
-    vec3 X = 0;
-    for( B_iter B=C->BODY; B!=C->BODY+C->NCBODY; ++B ) {
+    vect X = 0;
+    for( B_iter B=C->LEAF; B!=C->LEAF+C->NCLEAF; ++B ) {
       m += std::abs(B->SRC);
       X += B->X * std::abs(B->SRC);
     }
   KernelBase &operator=(const KernelBase) {return *this;}
 
 //! Set center of root cell
-  void setX0(vec3 x0) {X0 = x0;}
+  void setX0(vect x0) {X0 = x0;}
 //! Set radius of root cell
   void setR0(real r0) {R0 = r0;}
 
 //! Get center of root cell
-  vec3 getX0() const {return X0;}
+  vect getX0() const {return X0;}
 //! Get radius of root cell
   real getR0() const {return R0;}
 
 //! Set center and size of root cell
-  void setDomain(Bodies &bodies, vec3 x0=0, real r0=M_PI) {
-    vec3 xmin,xmax;                                             // Min,Max of domain
+  void setDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
+    vect xmin,xmax;                                             // Min,Max of domain
     B_iter B = bodies.begin();                                  // Reset body iterator
     xmin = xmax = B->X;                                         // Initialize xmin,xmax
     for( B=bodies.begin(); B!=bodies.end(); ++B ) {             // Loop over bodies
         else if(B->X[d] > xmax[d]) xmax[d] = B->X[d];           //   Determine xmax
       }                                                         //  End loop over each dimension
     }                                                           // End loop over bodies
-    for (int d=0; d<3; d++) x0[d] = (xmax[d] + xmin[d]) / 2;    // Calculate center of domain
-    r0 = 0;                                                     // Initialize localRadius
-    for (int d=0; d<3; d++) {                                   // Loop over dimensions
-      r0 = std::max(x0[d] - xmin[d], r0);                       // Calculate min distance from center
-      r0 = std::max(xmax[d] - x0[d], r0);                       // Calculate max distance from center 
-    }                                                           // End loop over dimensions
-    r0 *= 1.00001;                                              // Add some leeway to radius
 /*
     if( xmin[0] < x0[0]-r0 || x0[0]+r0 < xmax[0]                //  Check for outliers in x direction
      || xmin[1] < x0[1]-r0 || x0[1]+r0 < xmax[1]                //  Check for outliers in y direction
   void M2M(C_iter Ci);                                          //!< Evaluate M2M kernel on CPU
   void M2L(C_iter Ci, C_iter Cj) const;                         //!< Evaluate M2L kernel on CPU
   void M2P(C_iter Ci, C_iter Cj) const;                         //!< Evaluate M2P kernel on CPU
-  void P2P(C_iter C) const;                                     //!< Evaluate P2P kernel on CPU
   void P2P(C_iter Ci, C_iter Cj) const;                         //!< Evaluate P2P kernel on CPU
   void L2L(C_iter Ci) const;                                    //!< Evaluate L2L kernel on CPU
   void L2P(C_iter Ci) const;                                    //!< Evaluate L2P kernel on CPU

File include/parallelfmm.h

   std::vector<int>    sendCellDsp;                              //!< Vector of cell send displacements
   std::vector<int>    recvCellCnt;                              //!< Vector of cell recv counts
   std::vector<int>    recvCellDsp;                              //!< Vector of cell recv displacements
-  std::vector<vec3>   xminAll;                                  //!< Buffer for gathering XMIN
-  std::vector<vec3>   xmaxAll;                                  //!< Buffer for gathering XMAX
+  std::vector<vect>   xminAll;                                  //!< Buffer for gathering XMIN
+  std::vector<vect>   xmaxAll;                                  //!< Buffer for gathering XMAX
 
   JBodies sendBodies;                                           //!< Send buffer for bodies
   JBodies recvBodies;                                           //!< Recv buffer for bodies
       int irank = sendBodyRanks[i];                             //  Rank to send to & recv from
       for( int c=0; c!=sendBodyCellCnt[i]; ++c,++ic ) {         //  Loop over cells to send to that rank
         C_iter C = sendBodyCells[ic];                           //   Set cell iterator
-        for( B_iter B=C->BODY; B!=C->BODY+C->NDBODY; ++B ) {    //   Loop over bodies in that cell
+        for( B_iter B=C->LEAF; B!=C->LEAF+C->NDLEAF; ++B ) {    //   Loop over bodies in that cell
           JBody body;                                           //    Set compact body type for sending
           body.ICELL = B->ICELL;                                //    Set cell index of compact body type
           body.X     = B->X;                                    //    Set position of compact body type
   }
 
 //! Get boundries of domains on other processes
-  void getOtherDomain(vec3 &xmin, vec3 &xmax, int l) {
+  void getOtherDomain(vect &xmin, vect &xmax, int l) {
     startTimer("Get domain");                                   // Start timer
     MPI_Datatype MPI_TYPE = getType(XMIN[l][0]);                // Get MPI data type
-    vec3 send[2],recv[2];                                       // Send and recv buffer
+    vect send[2],recv[2];                                       // Send and recv buffer
     MPI_Request req;                                            // MPI requests
     send[0] = send[1] = XMIN[l];                                // Set XMIN into send buffer
     recv[0] = recv[1] = 0;                                      // Initialize recv buffer
   }
 
 //! Get disatnce to other domain
-  real getDistance(C_iter C, vec3 xmin, vec3 xmax) {
-    vec3 dist;                                                  // Distance vector
+  real getDistance(C_iter C, vect xmin, vect xmax) {
+    vect dist;                                                  // Distance vector
     for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
       dist[d] = (C->X[d] + Xperiodic[d] > xmax[d])*             //  Calculate the distance between cell C and
                 (C->X[d] + Xperiodic[d] - xmax[d])+             //  the nearest point in domain [xmin,xmax]^3
   }
 
 //! Determine which cells to send
-  void getLET(C_iter C0, C_iter C, vec3 xmin, vec3 xmax) {
+  void getLET(C_iter C0, C_iter C, vect xmin, vect xmax) {
     int level = int(log(MPISIZE-1) / M_LN2 / 3) + 1;            // Level of local root cell
     if( MPISIZE == 1 ) level = 0;                               // Account for serial case
     for( int i=0; i!=C->NCHILD; i++ ) {                         // Loop over child cells
   void cells2twigs(Cells &cells, Cells &twigs, bool last) {
     while( !cells.empty() ) {                                   // While cell vector is not empty
       if( cells.back().NCHILD == 0 ) {                          //  If cell has no child
-        if( cells.back().NDBODY == 0 || !last ) {               //   If cell has no leaf or is not last iteration
-          cells.back().NDBODY = 0;                              //    Set number of leafs to 0
+        if( cells.back().NDLEAF == 0 || !last ) {               //   If cell has no leaf or is not last iteration
+          cells.back().NDLEAF = 0;                              //    Set number of leafs to 0
           twigs.push_back(cells.back());                        //    Push cell into twig vector
         }                                                       //   Endif for no leaf
       }                                                         //  Endif for no child
       Cell cell;                                                //  Cell structure
       cell.ICELL = JC->ICELL;                                   //  Set index of cell
       cell.M     = JC->M;                                       //  Set multipole of cell
-      cell.CHILD = cell.NCBODY = cell.NDBODY = cell.NCHILD = 0; //  Set number of leafs and children
-      cell.BODY  = bodies.end();                                //  Set pointer to first leaf
+      cell.CHILD = cell.NCLEAF = cell.NDLEAF = cell.NCHILD = 0; //  Set number of leafs and children
+      cell.LEAF  = bodies.end();                                //  Set pointer to first leaf
       getCenter(cell);                                          //  Set center and radius
       twigs.push_back(cell);                                    //  Push cell into twig vector
     }                                                           // End loop over send buffer
       Cell cell;                                                //  Cell structure
       cell.ICELL = JC->ICELL;                                   //  Set index of cell
       cell.M     = JC->M;                                       //  Set multipole of cell
-      cell.CHILD = cell.NCBODY = cell.NDBODY = cell.NCHILD = 0; //  Set number of leafs and children
-      cell.BODY  = bodies.end();                                //  Set pointer to first leaf
+      cell.CHILD = cell.NCLEAF = cell.NDLEAF = cell.NCHILD = 0; //  Set number of leafs and children
+      cell.LEAF  = bodies.end();                                //  Set pointer to first leaf
       getCenter(cell);                                          //  Set center and radius
       twigs.push_back(cell);                                    //  Push cell into twig vector
     }                                                           // End loop over recv buffer
       if( twigs.back().ICELL != index ) {                       //  If twig's index is different from previous
         cells.push_back(twigs.back());                          //   Push twig into cell vector
         index = twigs.back().ICELL;                             //   Update index counter
-      } else if ( twigs.back().NDBODY == 0 || !last ) {         //  Elseif twig-twig collision
+      } else if ( twigs.back().NDLEAF == 0 || !last ) {         //  Elseif twig-twig collision
         cells.back().M += twigs.back().M;                       //   Accumulate the multipole
-      } else if ( cells.back().NDBODY == 0 ) {                  //  Elseif twig-body collision
+      } else if ( cells.back().NDLEAF == 0 ) {                  //  Elseif twig-body collision
         Mset M;                                                 //   Multipole for temporary storage
         M = cells.back().M;                                     //   Save multipoles from cells
         cells.back() = twigs.back();                            //   Copy twigs to cells
   void reindexBodies(Bodies &bodies, Cells &twigs, Cells &cells ,Cells &sticks) {
     startTimer("Reindex");                                      // Start timer
     while( !twigs.empty() ) {                                   // While twig vector is not empty
-      if( twigs.back().NDBODY == 0 ) {                          //  If twig has no leafs
+      if( twigs.back().NDLEAF == 0 ) {                          //  If twig has no leafs
         cells.push_back(twigs.back());                          //   Push twig into cell vector
       }                                                         //  Endif for no leafs
       twigs.pop_back();                                         //  Pop last element from twig vector
 
 //! Communicate cells in the local essential tree
   void commCells(Bodies &bodies, Cells &cells) {
-    vec3 xmin = 0, xmax = 0;                                    // Initialize domain boundaries
+    vect xmin = 0, xmax = 0;                                    // Initialize domain boundaries
     Cells twigs,sticks;                                         // Twigs and sticks are special types of cells
 
 #if 1
       if( l == LEVEL - 1 ) {                                    //  If at last level
         complex SUM = 0;                                        //   Initialize accumulator
         for(C_iter C=twigs.begin(); C!=twigs.end(); ++C) {      //   Loop over twigs
-          if( C->NDBODY == 0 ) SUM += C->M[0];                  //    Add multipoles of empty twigs
+          if( C->NDLEAF == 0 ) SUM += C->M[0];                  //    Add multipoles of empty twigs
         }                                                       //   End loop over twigs
         print("Before recv   : ",0);                            //   Print identifier
         print(SUM);                                             //   Print sum of multipoles

File include/partition.h

 
 protected:
   int LEVEL;                                                    //!< Level of the MPI process binary tree
-  std::vector<vec3> XMIN;                                       //!< Minimum position vector of bodies
-  std::vector<vec3> XMAX;                                       //!< Maximum position vector of bodies
+  std::vector<vect> XMIN;                                       //!< Minimum position vector of bodies
+  std::vector<vect> XMAX;                                       //!< Maximum position vector of bodies
   int nprocs[64][2];                                            //!< Number of processes in the two split groups
   int offset[64][2];                                            //!< Offset of body in the two split groups
   int  color[64][3];                                            //!< Color of hypercube communicators
   ~Partition() {}
 
 //! Set bounds of domain to be partitioned
-  void setGlobDomain(Bodies &bodies, vec3 x0=0, real r0=M_PI) {
+  void setGlobDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
     numCells1D = 1 << getMaxLevel(bodies);                      // Set initial number of bodies
     B_iter B = bodies.begin();                                  // Reset body iterator
     XMIN[0] = XMAX[0] = B->X;                                   // Initialize xmin,xmax
         else if(B->X[d] > XMAX[0][d]) XMAX[0][d] = B->X[d];     //   Determine xmax
       }                                                         //  End loop over each dimension
     }                                                           // End loop over bodies
-    vec3 X;                                                     // Recv buffer
+    vect X;                                                     // Recv buffer
     MPI_Allreduce(XMAX[0],X,3,MPI_TYPE,MPI_MAX,MPI_COMM_WORLD); // Reduce global maximum
     XMAX[0] = X;                                                // Get data from buffer
     MPI_Allreduce(XMIN[0],X,3,MPI_TYPE,MPI_MIN,MPI_COMM_WORLD); // Reduce global minimum

File include/serialfmm.h

 
 //! Topdown tree constructor interface. Input: bodies, Output: cells
   void topdown(Bodies &bodies, Cells &cells) {
-#if COMPARE
-    TopDown<equation>::growTree(bodies);
-    TopDown<equation>::linkTree(cells);
-    TopDown<equation>::upwardPass(cells);
-#else
     TopDown<equation>::grow(bodies);                            // Grow tree structure topdown
 
     TopDown<equation>::setIndex();                              // Set index of cells
       for( int i=1; i<MTERM; ++i ) C->M[i] /= C->M[0];
     }
 #endif
-#endif
   }
 
 //! Bottomup tree constructor interface. Input: bodies, Output: cells

File include/topdown.h

 class TopDown : public TreeStructure<equation> {
 public:
   using Kernel<equation>::printNow;                             //!< Switch to print timings
-  using Kernel<equation>::stringLength;                         //!< Max length of event name
   using Kernel<equation>::startTimer;                           //!< Start timer for given event
   using Kernel<equation>::stopTimer;                            //!< Stop timer for given event
   using Kernel<equation>::X0;                                   //!< Center of root cell
   using Kernel<equation>::R0;                                   //!< Radius of root cell
-  using Kernel<equation>::Ci0;                                  //!< Begin iterator for target cells
 
 private:
-//! Binary tree is used for counting number of bodies with a recursive approach
-  struct BinaryTreeNode {
-    ivec8            NBODY;                                     //!< Number of descendant bodies
-    BinaryTreeNode * LEFT;                                      //!< Pointer to left child
-    BinaryTreeNode * RIGHT;                                     //!< Pointer to right child
-    BinaryTreeNode * BEGIN;                                     //!< Pointer to begining of memory space
-    BinaryTreeNode * END;                                       //!< Pointer to end of memory space
+//! Nodes are primitive cells
+  struct Node {
+    int LEVEL;                                                  //!< Level of node
+    int ICHILD;                                                 //!< Flag of empty child nodes
+    int NLEAF;                                                  //!< Number of leafs in node
+    bigint I;                                                   //!< Cell index
+    bigint CHILD[8];                                            //!< Iterator offset of child nodes
+    B_iter LEAF[NCRIT];                                         //!< Iterator for leafs
+    vect X;                                                     //!< Node center
+    real R;                                                     //!< Node radius
   };
+  std::vector<Node> nodes;                                      //!< Nodes in the tree
 
-//! Octree is used for building the FMM tree structure as "nodes", then transformed to "cells" data structure
-  struct OctreeNode {
-    int          BODY;                                          //!< Index offset for first body in node
-    int          NBODY;                                         //!< Number of descendant bodies
-    int          NNODE;                                         //!< Number of descendant nodes
-    OctreeNode * CHILD[8];                                      //!< Pointer to child node
-    vec3         X;                                             //!< Coordinate at center
-  };
-
-  int          MAXLEVEL;                                        //!< Maximum level of tree
-  B_iter       B0;                                              //!< Iterator of first body
-  OctreeNode * N0;                                              //!< Octree root node
-
-private:
-//! Get number of binary tree nodes for a given number of bodies
-  inline int getNumBinNode(int n) const {
-    if (n <= NSPAWN) return 1;                                  // If less then threshold, use only one node
-    else return 4 * ((n - 1) / NSPAWN) - 1;                     // Else estimate number of binary tree nodes
+//! Calculate octant from position
+  int getOctant(const vect X, int i) {
+    int octant = 0;                                             // Initialize octant
+    for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
+      octant += (X[d] > nodes[i].X[d]) << d;                    //  interleave bits and accumulate octant
+    }                                                           // End loop over dimensions
+    return octant;                                              // Return octant
   }
 
-//! Get maximum number of binary tree nodes for a given number of bodies
-  inline int getMaxBinNode(int n) const {
-    return (4 * n) / NSPAWN;                                    // Conservative estimate of number of binary tree nodes
+//! Add child node and link it
+  void addChild(const int octant, int i) {
+    bigint pOff = ((1 << 3* nodes[i].LEVEL   ) - 1) / 7;        // Parent cell index offset
+    bigint cOff = ((1 << 3*(nodes[i].LEVEL+1)) - 1) / 7;        // Current cell index offset
+    vect x = nodes[i].X;                                        // Initialize new center position with old center
+    real r = nodes[i].R/2;                                      // Initialize new size
+    for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
+      x[d] += r * (((octant & 1 << d) >> d) * 2 - 1);           //  Calculate new center position
+    }                                                           // End loop over dimensions
+    Node node;                                                  // Node structure
+    node.NLEAF = node.ICHILD = 0;                               // Initialize child node counters
+    node.X = x;                                                 // Initialize child node center
+    node.R = r;                                                 // Initialize child node radius
+    node.LEVEL = nodes[i].LEVEL + 1;                            // Level of child node
+    node.I = ((nodes[i].I-pOff) << 3) + octant + cOff;          // Cell index of child node
+    nodes[i].ICHILD |= (1 << octant);                           // Flip bit of octant
+    nodes[i].CHILD[octant] = nodes.end()-nodes.begin();         // Link child node to its parent
+    nodes.push_back(node);                                      // Push child node into vector
   }
 
-//! Exclusive scan with offset
-  inline ivec8 exclusiveScan(ivec8 input, int offset) const {
-    ivec8 output;                                               // Output vector
-    for (int i=0; i<8; i++) {                                   // Loop over elements
-      output[i] = offset;                                       //  Set value
-      offset += input[i];                                       //  Increment offset
-    }                                                           // End loop over elements
-    return output;                                              // Return output vector
+//! Add leaf to node
+  void addLeaf(B_iter b, int i) {
+    nodes[i].LEAF[nodes[i].NLEAF] = b;                          // Assign body iterator to leaf
+    nodes[i].NLEAF++;                                           // Increment leaf counter
   }
 
-//! Create an octree node
-  OctreeNode * makeOctNode(int begin, int end, vec3 X, bool nochild) const {
-    OctreeNode * octNode = new OctreeNode();                    // Allocate memory for single node
-    octNode->BODY = begin;                                      // Index of first body in node
-    octNode->NBODY = end - begin;                               // Number of bodies in node
-    octNode->NNODE = 1;                                         // Initialize counter for decendant nodes
-    octNode->X = X;                                             // Center position of node
-    if (nochild) {                                              // If node has no children
-      for (int i=0; i<8; i++) octNode->CHILD[i] = NULL;         //  Initialize pointers to children
-    }                                                           // End if for node children
-    return octNode;                                             // Return node
+//! Split node and reassign leafs to child nodes
+  void splitNode(int i) {
+    for( int l=0; l!=NCRIT; ++l ) {                             // Loop over leafs in parent node
+      int octant = getOctant(nodes[i].LEAF[l]->X,i);            //  Find the octant where the body belongs
+      if( !(nodes[i].ICHILD & (1 << octant)) ) {                //  If child doesn't exist in this octant
+        addChild(octant,i);                                     //   Add new child to list
+      }                                                         //  Endif for octant
+      int c = nodes[i].CHILD[octant];                           //  Set counter for child node
+      addLeaf(nodes[i].LEAF[l],c);                              //  Add leaf to child
+      if( nodes[c].NLEAF >= NCRIT ) {                           //  If there are still too many leafs
+        splitNode(c);                                           //   Split the node into smaller ones
+      }                                                         //  Endif for too many leafs
+    }                                                           // End loop over leafs
   }
 
-//! Count bodies in each octant using binary tree recursion
-  void countBodies(Bodies& bodies, int begin, int end, vec3 X, BinaryTreeNode * binNode) {
-    assert(getNumBinNode(end - begin) <= binNode->END - binNode->BEGIN + 1);
-    if (end - begin <= NSPAWN) {                                // If number of bodies is less than threshold
-      for (int i=0; i<8; i++) binNode->NBODY[i] = 0;            //  Initialize number of bodies in octant
-      binNode->LEFT = binNode->RIGHT = NULL;                    //  Initialize pointers to left and right child node
-      for (int i=begin; i<end; i++) {                           //  Loop over bodies in node
-        vec3 x = bodies[i].X;                                   //   Position of body
-        int octant = (x[0] > X[0]) + ((x[1] > X[1]) << 1) + ((x[2] > X[2]) << 2);// Which octant body belongs to
-        binNode->NBODY[octant]++;                               //   Increment body count in octant
-      }                                                         //  End loop over bodies in node
-    } else {                                                    // Else if number of bodies is larger than threshold
-      int mid = (begin + end) / 2;                              //  Split range of bodies in half
-      int numLeftNode = getNumBinNode(mid - begin);             //  Number of binary tree nodes on left branch
-      int numRightNode = getNumBinNode(end - mid);              //  Number of binary tree nodes on right branch
-      assert(numLeftNode + numRightNode <= binNode->END - binNode->BEGIN);
-      binNode->LEFT = binNode->BEGIN;                           //  Assign first memory address to left node pointer
-      binNode->LEFT->BEGIN = binNode->LEFT + 1;                 //  Assign next memory address to left begin pointer
-      binNode->LEFT->END = binNode->LEFT + numLeftNode;         //  Keep track of last memory address used by left
-      binNode->RIGHT = binNode->LEFT->END;                      //  Assign that same address to right node pointer
-      binNode->RIGHT->BEGIN = binNode->RIGHT + 1;               //  Assign next memory address to right begin pointer
-      binNode->RIGHT->END = binNode->RIGHT + numRightNode;      //  Keep track of last memory address used by right
-      countBodies(bodies, begin, mid, X, binNode->LEFT);        //  Recursive call for left branch
-      countBodies(bodies, mid, end, X, binNode->RIGHT);         //  Recursive call for right branch
-      binNode->NBODY = binNode->LEFT->NBODY + binNode->RIGHT->NBODY;// Sum contribution from both branches
-    }                                                           // End if for number of bodies
+//! Traverse tree
+  void traverse(typename std::vector<Node>::iterator N) {
+    if( N->NLEAF >= NCRIT ) {                                   // If node has children
+      for( int i=0; i!=8; ++i ) {                               // Loop over children
+        if( N->ICHILD & (1 << i) ) {                            //  If child exists in this octant
+          traverse(nodes.begin()+N->CHILD[i]);                  //   Recursively search child node
+        }                                                       //  Endif for octant
+      }                                                         // End loop over children
+    } else {                                                    //  If child doesn't exist
+      for( int i=0; i!=N->NLEAF; ++i ) {                        //   Loop over leafs
+        N->LEAF[i]->ICELL = N->I;                               //    Store cell index in bodies
+      }                                                         //   End loop over leafs
+    }                                                           //  Endif for child existence
   }
 
-//! Sort bodies according to octant (Morton order)
-  void moveBodies(Bodies& bodies, Bodies& buffer, int begin, int end,
-                  BinaryTreeNode * binNode, ivec8 octantOffset, vec3 X) const {
-    if (binNode->LEFT == NULL) {                                // If there are no more child nodes
-      for (int i=begin; i<end; i++) {                           //  Loop over bodies
-        vec3 x = bodies[i].X;                                   //   Position of body
-        int octant = (x[0] > X[0]) + ((x[1] > X[1]) << 1) + ((x[2] > X[2]) << 2);// Which octant body belongs to`
-        buffer[octantOffset[octant]] = bodies[i];               //   Permute bodies out-of-place according to octant
-        octantOffset[octant]++;                                 //   Increment body count in octant
-      }                                                         //  End loop over bodies
-    } else {                                                    // Else if there are child nodes
-      int mid = (begin + end) / 2;                              //  Split range of bodies in half
-      moveBodies(bodies, buffer, begin, mid, binNode->LEFT, octantOffset, X);// Recursive call for left branch
-      octantOffset += binNode->LEFT->NBODY;                     //  Increment the octant offset for right branch
-      moveBodies(bodies, buffer, mid, end, binNode->RIGHT, octantOffset, X);// Recursive call for right branch
-    }                                                           // End if for child existance
-  }
-
-//! Build nodes of octree adaptively using a top-down approach based on recursion (uses task based thread parallelism)
-  OctreeNode * buildNodes(Bodies& bodies, Bodies& buffer, int begin, int end,
-                          BinaryTreeNode * binNode, vec3 X, int level=0, bool direction=false) {
-    assert(getMaxBinNode(end - begin) <= binNode->END - binNode->BEGIN);
-    if (begin == end) return NULL;                              // If no bodies left, return null pointer
-    if (end - begin <= NCRIT) {                                 // If number of bodies is less than threshold
-      if (direction)                                            //  If direction of data is from bodies to buffer
-        for (int i=begin; i<end; i++) buffer[i] = bodies[i];    //   Copy bodies to buffer
-      return makeOctNode(begin, end, X, true);                  //  Create an octree node and return it's pointer
-    }                                                           // End if for number of bodies
-    OctreeNode * octNode = makeOctNode(begin, end, X, false);   // Create an octree node with child nodes
-    countBodies(bodies, begin, end, X, binNode);                // Count bodies in each octant using binary recursion
-    ivec8 octantOffset = exclusiveScan(binNode->NBODY, begin);  // Exclusive scan to obtain offset from octant count
-    moveBodies(bodies, buffer, begin, end, binNode, octantOffset, X);// Sort bodies according to octant
-    BinaryTreeNode * binNodeOffset = binNode->BEGIN;            // Initialize pointer offset for binary tree nodes
-    for (int i=0; i<8; i++) {                                   // Loop over children
-      int maxBinNode = getMaxBinNode(binNode->NBODY[i]);        //  Get maximum number of binary tree nodes
-      assert(binNodeOffset + maxBinNode <= binNode->END);
-      vec3 Xchild = X;                                          //  Initialize center position of child node
-      real r = R0 / (1 << (level + 1));                         //  Radius of cells for child's level
-      for (int d=0; d<3; d++) {                                 //  Loop over dimensions
-        Xchild[d] += r * (((i & 1 << d) >> d) * 2 - 1);         //   Shift center position to that of child node
-      }                                                         //  End loop over dimensions
-      BinaryTreeNode binNodeChild[1];                           //  Allocate new root for this branch
-      binNodeChild->BEGIN = binNodeOffset;                      //  Assign first memory address from offset
-      binNodeChild->END = binNodeOffset + maxBinNode;           //  Keep track of last memory address
-      octNode->CHILD[i] = buildNodes(buffer, bodies,            //  Recursive call for each child
-        octantOffset[i], octantOffset[i] + binNode->NBODY[i],   //  Range of bodies is calcuated from octant offset
-        binNodeChild, Xchild, level+1, !direction);             //  Alternate copy direction bodies <-> buffer
-      binNodeOffset += maxBinNode;                              //  Increment offset for binNode memory address
-    }                                                           // End loop over children
-    for (int i=0; i<8; i++) {                                   // Loop over children
-      if (octNode->CHILD[i]) octNode->NNODE += octNode->CHILD[i]->NNODE;// If child exists increment child node counter
-    }                                                           // End loop over chlidren
-    return octNode;                                             // Return octree node
-  }
-
-//! Create cell data structure from nodes
-  void nodes2cells(OctreeNode * octNode, C_iter C, C_iter CN, int level=0, int iparent=0) {
-    C->PARENT = iparent;                                        // Index of parent cell
-    C->R      = R0 / (1 << level);                     // Cell radius
-    C->X      = octNode->X;                                     // Cell center
-    C->NDBODY = octNode->NBODY;                                 // Number of decendant bodies
-    C->BODY   = B0 + octNode->BODY;                             // Iterator of first body in cell
-    if (octNode->NNODE == 1) {                                  // If node has no children
-      C->CHILD  = 0;                                            //  Set index of first child cell to zero
-      C->NCHILD = 0;                                            //  Number of child cells
-      C->NCBODY = octNode->NBODY;                               //  Number of bodies in cell
-      assert(C->NCBODY > 0);
-      MAXLEVEL = std::max(MAXLEVEL, level);                     //  Update maximum level of tree
-    } else {                                                    // Else if node has children
-      C->NCBODY = 0;                                            //  Set number of bodies in cell to zero
-      int nchild = 0;                                           //  Initialize number of child cells
-      int octants[8];                                           //  Map of child index to octants (for when nchild < 8)
-      for (int i=0; i<8; i++) {                                 //  Loop over octants
-        if (octNode->CHILD[i]) {                                //   If child exists for that octant
-          octants[nchild] = i;                                  //    Map octant to child index
-          nchild++;                                             //    Increment child cell counter
-        }                                                       //   End if for child existance
-      }                                                         //  End loop over octants
-      C_iter Ci = CN;                                           //  CN points to the next free memory address
-      C->CHILD = Ci - Ci0;                                      //  Set Index of first child cell
-      C->NCHILD = nchild;                                       //  Number of child cells
-      assert(C->NCHILD > 0);
-      CN += nchild;                                             //  Increment next free memory address
-      for (int i=0; i<nchild; i++) {                            //  Loop over children
-        int octant = octants[i];                                //   Get octant from child index
-        nodes2cells(octNode->CHILD[octant], Ci, CN, level+1, C-Ci0);// Recursive call for each child
-        Ci++;                                                   //   Increment cell iterator
-        CN += octNode->CHILD[octant]->NNODE - 1;                //   Increment next free memory address
-      }                                                         //  End loop over children
-      for (int i=0; i<nchild; i++) {                            //  Loop over children
-        int octant = octants[i];                                //   Get octant from child index
-        delete octNode->CHILD[octant];                          //   Free child pointer to avoid memory leak
-      }                                                         //  End loop over children
-      MAXLEVEL = std::max(MAXLEVEL, level+1);                   //  Update maximum level of tree
-    }                                                           // End if for child existance
-  }
-
-//! Get Xmin and Xmax of domain
-  vec3Pair getBounds(B_iter BiBegin, B_iter BiEnd) {
-    assert(BiEnd - BiBegin > 0);
-    if (BiEnd - BiBegin < NSPAWN) {                             // If number of elements is small enough
-      vec3 Xmin = BiBegin->X, Xmax = BiBegin->X;                //  Initialize Xmin and Xmax with first value
-      for (B_iter B=BiBegin; B!=BiEnd; B++) {                   //  Loop over range of bodies
-        Xmin = min(B->X, Xmin);                                 //   Update Xmin
-        Xmax = max(B->X, Xmax);                                 //   Update Xmax
-      }                                                         //  End loop over range of bodies
-      return vec3Pair(Xmin, Xmax);                              //  Return Xmin and Xmax as pair
-    } else {                                                    // Else if number of elements are large
-      B_iter BiMid = BiBegin + (BiEnd - BiBegin) / 2;           //  Middle iterator
-      vec3Pair bounds0, bounds1;                                //  Pair : first Xmin : second Xmax
-      bounds0 = getBounds(BiBegin, BiMid);                      //  Recursive call with new task
-      bounds1 = getBounds(BiMid, BiEnd);                        //  Recursive call with old task
-      bounds0.first = min(bounds0.first, bounds1.first);        //  Minimum of the two Xmins
-      bounds0.second = max(bounds0.second, bounds1.second);     //  Maximum of the two Xmaxs
-      return bounds0;                                           //  Return Xmin and Xmax
-    }                                                           // End if for number fo elements
-  }
-
-protected:
-//! Grow tree structure top down
-  void growTree(Bodies &bodies) {
-    Bodies buffer = bodies;                                     // Copy bodies to buffer
+public:
+//! Grow tree from root
+  void grow(Bodies &bodies) {
     startTimer("Grow tree");                                    // Start timer
-    B0 = bodies.begin();                                        // Bodies iterator
-    BinaryTreeNode binNode[1];                                  // Allocate root node of binary tree
-    int maxBinNode = getMaxBinNode(bodies.size());              // Get maximum size of binary tree
-    binNode->BEGIN = new BinaryTreeNode[maxBinNode];            // Allocate array for binary tree nodes
-    binNode->END = binNode->BEGIN + maxBinNode;                 // Set end pointer
-    N0 = buildNodes(bodies, buffer, 0, bodies.size(), binNode, X0);// Build tree recursively
-    delete[] binNode->BEGIN;                                    // Deallocate binary tree array
+    int octant;                                                 // In which octant is the body located?
+    Node node;                                                  // Node structure
+    node.LEVEL = node.NLEAF = node.ICHILD = node.I = 0;         // Initialize root node counters
+    node.X = X0;                                                // Initialize root node center
+    node.R = R0;                                                // Initialize root node radius
+    nodes.push_back(node);                                      // Push child node into vector
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      int i = 0;                                                //  Reset node counter
+      while( nodes[i].NLEAF >= NCRIT ) {                        //  While the nodes have children
+        nodes[i].NLEAF++;                                       //   Increment the cumulative leaf counter
+        octant = getOctant(B->X,i);                             //   Find the octant where the body belongs
+        if( !(nodes[i].ICHILD & (1 << octant)) ) {              //   If child doesn't exist in this octant
+          addChild(octant,i);                                   //    Add new child to list
+        }                                                       //   Endif for child existence
+        i = nodes[i].CHILD[octant];                             //    Update node iterator to child
+      }                                                         //  End while loop
+      addLeaf(B,i);                                             //  Add body to node as leaf
+      if( nodes[i].NLEAF >= NCRIT ) {                           //  If there are too many leafs
+        splitNode(i);                                           //   Split the node into smaller ones
+      }                                                         //  Endif for splitting
+    }                                                           // End loop over bodies
     stopTimer("Grow tree",printNow);                            // Stop timer
   }
 
-//! Link tree structure
-  void linkTree(Cells &cells) {
-    startTimer("Link tree");                                    // Start timer
-    cells.resize(N0->NNODE);                                    // Allocate cells array
-    Ci0 = cells.begin();                                        // Cell begin iterator
-    nodes2cells(N0, Ci0, Ci0+1);                                // Convert nodes to cells recursively
-    delete N0;                                                  // Deallocate nodes
-    stopTimer("Link tree",printNow);                            // Stop timer
+//! Store cell index of all bodies
+  void setIndex() {
+    startTimer("Set index");                                    // Start timer
+    traverse(nodes.begin());                                    // Traverse tree
+    stopTimer("Set index",printNow);                            // Stop timer 
   }
-
-  void printTreeData(Cells &cells) {
-    std::cout << "--- FMM stats --------------------" << std::endl
-    << std::setw(stringLength) << std::left                      // Set format
-    << "Bodies"     << " : " << cells.front().NDBODY << std::endl// Print number of bodies
-    << std::setw(stringLength) << std::left                      // Set format
-    << "Cells"      << " : " << cells.size() << std::endl       // Print number of cells
-    << std::setw(stringLength) << std::left                      // Set format
-    << "Tree depth" << " : " << MAXLEVEL << std::endl           // Print number of levels
-#if COUNT
-    << std::setw(stringLength) << std::left                      // Set format
-    << "P2P calls"  << " : " << NP2P << std::endl               // Print number of P2P calls
-    << std::setw(stringLength) << std::left                      // Set format
-    << "M2L calls"  << " : " << NM2L << std::endl               // Print number of M2l calls
-#endif
-    << "--- FMM stats --------------------" << std::endl;
-  }
-public:
-  TopDown() : MAXLEVEL(0) {}
-
 };
 
 #endif

File include/topdown2.h

-/*
-Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-#ifndef topdown_h
-#define topdown_h
-#include "tree.h"
-
-//! Topdown tree constructor class
-template<Equation equation>
-class TopDown : public TreeStructure<equation> {
-public:
-  using Kernel<equation>::printNow;                             //!< Switch to print timings
-  using Kernel<equation>::startTimer;                           //!< Start timer for given event
-  using Kernel<equation>::stopTimer;                            //!< Stop timer for given event
-  using Kernel<equation>::X0;                                   //!< Center of root cell
-  using Kernel<equation>::R0;                                   //!< Radius of root cell
-
-private:
-//! Nodes are primitive cells
-  struct Node {
-    int LEVEL;                                                  //!< Level of node
-    int ICHILD;                                                 //!< Flag of empty child nodes
-    int NBODY;                                                  //!< Number of leafs in node
-    bigint I;                                                   //!< Cell index
-    bigint CHILD[8];                                            //!< Iterator offset of child nodes
-    B_iter BODY[NCRIT];                                         //!< Iterator for leafs
-    vec3 X;                                                     //!< Node center
-    real R;                                                     //!< Node radius
-  };
-  std::vector<Node> nodes;                                      //!< Nodes in the tree
-
-//! Calculate octant from position
-  int getOctant(const vec3 X, int i) {
-    int octant = 0;                                             // Initialize octant
-    for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
-      octant += (X[d] > nodes[i].X[d]) << d;                    //  interleave bits and accumulate octant
-    }                                                           // End loop over dimensions
-    return octant;                                              // Return octant
-  }
-
-//! Add child node and link it
-  void addChild(const int octant, int i) {
-    bigint pOff = ((1 << 3* nodes[i].LEVEL   ) - 1) / 7;        // Parent cell index offset
-    bigint cOff = ((1 << 3*(nodes[i].LEVEL+1)) - 1) / 7;        // Current cell index offset
-    vec3 x = nodes[i].X;                                        // Initialize new center position with old center
-    real r = nodes[i].R/2;                                      // Initialize new size
-    for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
-      x[d] += r * (((octant & 1 << d) >> d) * 2 - 1);           //  Calculate new center position
-    }                                                           // End loop over dimensions
-    Node node;                                                  // Node structure
-    node.NBODY = node.ICHILD = 0;                               // Initialize child node counters
-    node.X = x;                                                 // Initialize child node center
-    node.R = r;                                                 // Initialize child node radius
-    node.LEVEL = nodes[i].LEVEL + 1;                            // Level of child node
-    node.I = ((nodes[i].I-pOff) << 3) + octant + cOff;          // Cell index of child node
-    nodes[i].ICHILD |= (1 << octant);                           // Flip bit of octant
-    nodes[i].CHILD[octant] = nodes.end()-nodes.begin();         // Link child node to its parent
-    nodes.push_back(node);                                      // Push child node into vector
-  }
-
-//! Add leaf to node
-  void addLeaf(B_iter b, int i) {
-    nodes[i].BODY[nodes[i].NBODY] = b;                          // Assign body iterator to leaf
-    nodes[i].NBODY++;                                           // Increment leaf counter
-  }
-
-//! Split node and reassign leafs to child nodes
-  void splitNode(int i) {
-    for( int l=0; l!=NCRIT; ++l ) {                             // Loop over leafs in parent node
-      int octant = getOctant(nodes[i].BODY[l]->X,i);            //  Find the octant where the body belongs
-      if( !(nodes[i].ICHILD & (1 << octant)) ) {                //  If child doesn't exist in this octant
-        addChild(octant,i);                                     //   Add new child to list
-      }                                                         //  Endif for octant
-      int c = nodes[i].CHILD[octant];                           //  Set counter for child node
-      addLeaf(nodes[i].BODY[l],c);                              //  Add leaf to child
-      if( nodes[c].NBODY >= NCRIT ) {                           //  If there are still too many leafs
-        splitNode(c);                                           //   Split the node into smaller ones
-      }                                                         //  Endif for too many leafs
-    }                                                           // End loop over leafs
-  }
-
-//! Traverse tree
-  void traverse(typename std::vector<Node>::iterator N) {
-    if( N->NBODY >= NCRIT ) {                                   // If node has children
-      for( int i=0; i!=8; ++i ) {                               // Loop over children
-        if( N->ICHILD & (1 << i) ) {                            //  If child exists in this octant
-          traverse(nodes.begin()+N->CHILD[i]);                  //   Recursively search child node
-        }                                                       //  Endif for octant
-      }                                                         // End loop over children
-    } else {                                                    //  If child doesn't exist
-      for( int i=0; i!=N->NBODY; ++i ) {                        //   Loop over leafs
-        N->BODY[i]->ICELL = N->I;                               //    Store cell index in bodies
-      }                                                         //   End loop over leafs
-    }                                                           //  Endif for child existence
-  }
-
-public:
-//! Grow tree from root
-  void grow(Bodies &bodies) {
-    startTimer("Grow tree");                                    // Start timer
-    int octant;                                                 // In which octant is the body located?
-    Node node;                                                  // Node structure
-    node.LEVEL = node.NBODY = node.ICHILD = node.I = 0;         // Initialize root node counters
-    node.X = X0;                                                // Initialize root node center
-    node.R = R0;                                                // Initialize root node radius
-    nodes.push_back(node);                                      // Push child node into vector
-    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
-      int i = 0;                                                //  Reset node counter
-      while( nodes[i].NBODY >= NCRIT ) {                        //  While the nodes have children
-        nodes[i].NBODY++;                                       //   Increment the cumulative leaf counter
-        octant = getOctant(B->X,i);                             //   Find the octant where the body belongs
-        if( !(nodes[i].ICHILD & (1 << octant)) ) {              //   If child doesn't exist in this octant
-          addChild(octant,i);                                   //    Add new child to list
-        }                                                       //   Endif for child existence
-        i = nodes[i].CHILD[octant];                             //    Update node iterator to child
-      }                                                         //  End while loop
-      addLeaf(B,i);                                             //  Add body to node as leaf
-      if( nodes[i].NBODY >= NCRIT ) {                           //  If there are too many leafs
-        splitNode(i);                                           //   Split the node into smaller ones
-      }                                                         //  Endif for splitting
-    }                                                           // End loop over bodies
-    stopTimer("Grow tree",printNow);                            // Stop timer
-  }
-
-//! Store cell index of all bodies
-  void setIndex() {
-    startTimer("Set index");                                    // Start timer
-    traverse(nodes.begin());                                    // Traverse tree
-    stopTimer("Set index",printNow);                            // Stop timer 
-  }
-};
-
-#endif

File include/tree.h

   using Kernel<equation>::sortCells;                            //!< Sort cells according to cell index
   using Kernel<equation>::X0;                                   //!< Center of root cell
   using Kernel<equation>::R0;                                   //!< Radius of root cell
-  using Kernel<equation>::Ci0;                                  //!< Begin iterator for target cells
-  using Kernel<equation>::Cj0;                                  //!< Begin iterator for source cells
   using Evaluator<equation>::NP2P;                              //!< Number of P2P kernel calls
   using Evaluator<equation>::NM2P;                              //!< Number of M2P kernel calls
   using Evaluator<equation>::NM2L;                              //!< Number of M2L kernel calls
   using Evaluator<equation>::traverse;                          //!< Traverse tree to get interaction list
   using Evaluator<equation>::traversePeriodic;                  //!< Traverse tree for periodic images
   using Evaluator<equation>::neighbor;                          //!< Traverse source tree to get neighbor list
-  using Evaluator<equation>::P2M;                               //!< Evaluate P2M kernel
-  using Evaluator<equation>::M2M;                               //!< Evaluate M2M kernel
-  using Evaluator<equation>::L2L;                               //!< Evaluate L2L kernel
-  using Evaluator<equation>::L2P;                               //!< Evaluate L2P kernel
   using Evaluator<equation>::evalP2M;                           //!< Evaluate P2M kernel
   using Evaluator<equation>::evalM2M;                           //!< Evaluate M2M kernel
   using Evaluator<equation>::evalM2L;                           //!< Evaluate M2L kernel
       } else if( c != c_old ) {                                 //  If cell index is repeated
         if( cells[c].NCHILD != 0 ) {                            //   Stick-cell collision
           cells[c_old].NCHILD = cells[c].NCHILD;                //    Copy number of children
-          cells[c_old].NCBODY = cells[c].NCBODY;                //    Copy number of leafs
-          cells[c_old].NDBODY = cells[c].NDBODY;                //    Copy number of leafs
+          cells[c_old].NCLEAF = cells[c].NCLEAF;                //    Copy number of leafs
+          cells[c_old].NDLEAF = cells[c].NDLEAF;                //    Copy number of leafs
           cells[c_old].PARENT = cells[c].PARENT;                //    Copy parent link
           cells[c_old].CHILD = cells[c].CHILD;                  //    Copy child link
-          cells[c_old].BODY = cells[c].BODY;                    //    Copy iterator of first leaf
+          cells[c_old].LEAF = cells[c].LEAF;                    //    Copy iterator of first leaf
           sticks.push_back(cells[c_old]);                       //    Push stick into vector
         }                                                       //   Endif for collision type
         cells[c_old].M += cells[c].M;                           //   Accumulate multipole
     parent.ICELL = getParent(cells[begin].ICELL);               // Set cell index
     parent.M = 0;                                               // Initialize multipole coefficients
     parent.L = 0;                                               // Initlalize local coefficients
-    parent.NCBODY = parent.NDBODY = parent.NCHILD = 0;          // Initialize NCBODY, NDBODY, & NCHILD
-    parent.BODY = cells[begin].BODY;                            // Set pointer to first leaf
+    parent.NCLEAF = parent.NDLEAF = parent.NCHILD = 0;          // Initialize NCLEAF, NDLEAF, & NCHILD
+    parent.LEAF = cells[begin].LEAF;                            // Set pointer to first leaf
     parent.CHILD = begin;                                       // Link to child
     getCenter(parent);                                          // Set cell center and radius
     for( int i=begin; i!=oldend; ++i ) {                        // Loop over cells at this level
         parent.ICELL = getParent(cells[i].ICELL);               //   Set cell index
         parent.M = 0;                                           //   Initialize multipole coefficients
         parent.L = 0;                                           //   Initialize local coefficients
-        parent.NCBODY = parent.NDBODY = parent.NCHILD = 0;      //   Initialize NCBODY, NDBODY, & NCHILD
-        parent.BODY = cells[i].BODY;                            //   Set pointer to first leaf
+        parent.NCLEAF = parent.NDLEAF = parent.NCHILD = 0;      //   Initialize NCLEAF, NDLEAF, & NCHILD
+        parent.LEAF = cells[i].LEAF;                            //   Set pointer to first leaf
         parent.CHILD = i;                                       //   Link to child
         getCenter(parent);                                      //   Set cell center and radius
       }                                                         //  Endif for new parent cell
         (cells.begin()+cells[i].CHILD+c)->PARENT = i;           //   Link child to current
       }                                                         //  End loop over child cells
       cells[i].PARENT = end;                                    //  Link to current to parent
-      parent.NDBODY += cells[i].NDBODY;                         //  Add nleaf of child to parent
+      parent.NDLEAF += cells[i].NDLEAF;                         //  Add nleaf of child to parent
       parents.push_back(parent);                                //  Push parent cell into vector
       parent = parents.back();                                  //  Copy information from vector
       parents.pop_back();                                       //  Pop parent cell from vector
     begin = oldend;                                             // Set new begin index to old end index
   }
 
-//! Recursive call for upward pass
-  void upwardRecursion(C_iter C, C_iter C0) {
-    for (C_iter CC=C0+C->CHILD; CC!=C0+C->CHILD+C->NCHILD; CC++) {// Loop over child cells
-      upwardRecursion(CC, C0);                                  //  Recursive call with new task
-    }                                                           // End loop over child cells
-    C->M = 0;                                                   // Initialize multipole expansion coefficients
-    C->L = 0;                                                   // Initialize local expansion coefficients
-    P2M(C);                                                     // P2M kernel
-    M2M(C);                                                     // M2M kernel
-  }
-
-//! Recursive call for downward pass
-  void downwardRecursion(C_iter C, C_iter C0) const {
-    L2L(C);                                                     // L2L kernel
-    L2P(C);                                                     // L2P kernel
-    for (C_iter CC=C0+C->CHILD; CC!=C0+C->CHILD+C->NCHILD; CC++) {// Loop over child cells
-      downwardRecursion(CC, C0);                                //  Recursive call with new task
-    }                                                           // End loop over chlid cells
-  }
-
-//! Error optimization of Rcrit
-  void setRcrit(C_iter C, C_iter C0, real c) {
-    for (C_iter CC=C0+C->CHILD; CC!=C0+C->CHILD+C->NCHILD; CC++) {// Loop over child cells
-      setRcrit(CC, C0, c);                                      //  Recursive call with new task
-    }                                                           // End loop over child cells
-    for( int i=1; i<MTERM; ++i ) C->M[i] /= C->M[0];            // Normalize multipole expansion coefficients
-    real x = 1.0 / THETA;                                       // Inverse of theta
-#if ERROR_OPT
-    real a = c * powf(std::abs(C->M[0]),1.0/3);                 // Cell coefficient
-    for (int i=0; i<5; i++) {                                   // Newton-Rhapson iteration
-      real f = x * x - 2 * x + 1 - a * pow(x,-P);               //  Function value
-      real df = (P + 2) * x - 2 * (P + 1) + P / x;              //  Function derivative value
-      x -= f / df;                                              //  Increment x
-    }                                                           // End Newton-Rhapson iteration
-#endif
-    C->RCRIT *= x;                                              // Multiply Rcrit by error optimized parameter x
-  }
-
 protected:
 //! Get cell center and radius from cell index
   void getCenter(Cell &cell) {
     }                                                           // End loop over dimensions
   }
 
-//! Upward pass (P2M, M2M)
-  void upwardPass(Cells &cells) {
-    startTimer("Upward pass");                                  // Start timer
-    Ci0 = cells.begin();                                        // Set iterator of target root cell
-    Cj0 = cells.begin();                                        // Set iterator of source root cell
-    upwardRecursion(Ci0, Ci0);                                  // Recursive call for upward pass
-    real c = (1 - THETA) * (1 - THETA) / pow(THETA,P+2) / powf(std::abs(Ci0->M[0]),1.0/3); // Root coefficient
-    setRcrit(Ci0, Ci0, c);                                      // Error optimization of Rcrit
-    for (C_iter C=cells.begin(); C!=cells.begin()+9; C++) {     // Loop over top 2 levels of cells
-      C->RCRIT *= 10;                                           //  Prevent approximation
-    }                                                           // End loop over top 2 levels of cells
-    stopTimer("Upward pass",printNow);                          // Stop timer
-  }
-
-//! Downward pass (L2L, L2P)
-  void downwardPass(Cells &cells) {
-    startTimer("Downward pass");                                // Start timer
-    C_iter C0 = cells.begin();                                  // Root cell
-    L2P(C0);                                                    // If root is the only cell do L2P
-    for (C_iter CC=C0+C0->CHILD; CC!=C0+C0->CHILD+C0->NCHILD; CC++) {// Loop over child cells
-      downwardRecursion(CC, C0);                                //  Recursive call for downward pass
-    }                                                           // End loop over child cells
-    stopTimer("Downward pass",printNow);                        // Stop timer
-  }
-
 public:
 //! Group bodies into twig cells
   void bodies2twigs(Bodies &bodies, Cells &twigs) {
     Cell cell;                                                  // Cell structure
     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
       if( B->ICELL != index ) {                                 //  If it belongs to a new cell
-        cell.NCBODY = nleaf;                                    //   Set number of child leafs
-        cell.NDBODY = nleaf;                                    //   Set number of decendant leafs
+        cell.NCLEAF = nleaf;                                    //   Set number of child leafs
+        cell.NDLEAF = nleaf;                                    //   Set number of decendant leafs
         cell.CHILD  = 0;                                        //   Set pointer offset to first child
         cell.NCHILD = 0;                                        //   Set number of child cells
         cell.ICELL  = index;                                    //   Set cell index
-        cell.BODY   = firstLeaf;                                //   Set pointer to first leaf
+        cell.LEAF   = firstLeaf;                                //   Set pointer to first leaf
         getCenter(cell);                                        //   Set cell center and radius
         twigs.push_back(cell);                                  //   Push cells into vector
         firstLeaf = B;                                          //   Set new first leaf
       }                                                         //  Endif for new cell
       nleaf++;                                                  //  Increment body counter
     }                                                           // End loop over bodies
-    cell.NCBODY = nleaf;                                        // Set number of child leafs
-    cell.NDBODY = nleaf;                                        // Set number of decendant leafs
+    cell.NCLEAF = nleaf;                                        // Set number of child leafs
+    cell.NDLEAF = nleaf;                                        // Set number of decendant leafs
     cell.CHILD  = 0;                                            //   Set pointer offset to first child
     cell.NCHILD = 0;                                            // Set number of child cells
     cell.ICELL  = index;                                        // Set cell index
-    cell.BODY   = firstLeaf;                                    // Set pointer to first leaf
+    cell.LEAF   = firstLeaf;                                    // Set pointer to first leaf
     getCenter(cell);                                            // Set cell center and radius
     twigs.push_back(cell);                                      // Push cells into vector
     stopTimer("Bodies2twigs",printNow);                         // Stop timer & print
     evalM2P(cells);                                             // Evaluate queued M2P kernels (only GPU)
     evalP2P(cells);                                             // Evaluate queued P2P kernels (only GPU)
 #endif
-#if COMPARE
-    downwardPass(cells);
-#else
     evalL2L(cells);                                             // Evaluate all L2L kernels
     evalL2P(cells);                                             // Evaluate all L2P kernels
-#endif
     if(printNow) std::cout << "P2P: "  << NP2P
                            << " M2P: " << NM2P
                            << " M2L: " << NM2L << std::endl;

File include/types.h

 }
 #define OMP_NUM_THREADS 1
 #else
-//#include <omp.h>
-//#define OMP_NUM_THREADS 12
+#include <omp.h>
+#define OMP_NUM_THREADS 12
 #endif
 
 typedef unsigned           bigint;                              //!< Big integer type
 typedef float              real;                                //!< Real number type on CPU
 typedef float              gpureal;                             //!< Real number type on GPU
 typedef std::complex<real> complex;                             //!< Complex number type
-typedef vec<3,real>        vec3;                                //!< 3-D vector type
-typedef vec<3,float>       fvec3;                               //!< Vector of 3 single precision types
-typedef vec<8,int>         ivec8;                               //!< Vector of 8 integer types
-typedef std::pair<vec3,vec3> vec3Pair;                          //!< Pair of vec3
+typedef vec<3,real>        vect;                                //!< 3-D vector type
+
 
 #ifndef KERNEL
 int MPIRANK    = 0;                                             //!< MPI comm rank
 int DEVICE     = 0;                                             //!< GPU device ID
 int IMAGES     = 0;                                             //!< Number of periodic image sublevels
 real THETA     = .5;                                            //!< Multipole acceptance criteria
-vec3 Xperiodic = 0;                                             //!< Coordinate offset of periodic image
+vect Xperiodic = 0;                                             //!< Coordinate offset of periodic image
 #if PAPI
 int PAPIEVENT  = PAPI_NULL;                                     //!< PAPI event handle
 #endif
 extern int DEVICE;                                              //!< GPU device ID
 extern int IMAGES;                                              //!< Number of periodic image sublevels
 extern real THETA;                                              //!< Multipole acceptance criteria
-extern vec3 Xperiodic;                                          //!< Coordinate offset of periodic image
+extern vect Xperiodic;                                          //!< Coordinate offset of periodic image
 #if PAPI
 extern int PAPIEVENT;                                           //!< PAPI event handle
 #endif
 
 const int  P        = 10;                                       //!< Order of expansions
 const int  NCRIT    = 100;                                      //!< Number of bodies per cell
-const int  NSPAWN   = 1000;                                     //!< Threshold of NDBODY for spawning new threads
 const int  MAXBODY  = 200000;                                   //!< Maximum number of bodies per GPU kernel
 const int  MAXCELL  = 10000000;                                 //!< Maximum number of bodies/coefs in cell per GPU kernel
 const real CLET     = 2;                                        //!< LET opening critetia
   int         IBODY;                                            //!< Initial body numbering for sorting back
   int         IPROC;                                            //!< Initial process numbering for partitioning back
   bigint      ICELL;                                            //!< Cell index
-  vec3        X;                                                //!< Position
+  vect        X;                                                //!< Position
   real        SRC;                                              //!< Scalar source values
 };
 typedef std::vector<JBody>             JBodies;                 //!< Vector of source bodies
 //! Linked list of leafs (only used in fast/topdown.h)
 struct Leaf {
   int I;                                                        //!< Unique index for every leaf
-  vec3 X;                                                       //!< Coordinate of leaf
+  vect X;                                                       //!< Coordinate of leaf
   Leaf *NEXT;                                                   //!< Pointer to next leaf
 };
 typedef std::vector<Leaf>              Leafs;                   //!< Vector of leafs
 struct Node {
   bool NOCHILD;                                                 //!< Flag for twig nodes
   int  LEVEL;                                                   //!< Level in the tree structure
-  int  NBODY;                                                   //!< Number of descendant leafs
+  int  NLEAF;                                                   //!< Number of descendant leafs
   int  CHILD[8];                                                //!< Index of child node
-  vec3 X;                                                       //!< Coordinate at center
-  Leaf *BODY;                                                   //!< Pointer to first leaf
+  vect X;                                                       //!< Coordinate at center
+  Leaf *LEAF;                                                   //!< Pointer to first leaf
 };
 typedef std::vector<Node>              Nodes;                   //!< Vector of nodes
 typedef std::vector<Node>::iterator    N_iter;                  //!< Iterator for node vector
 struct Cell {
   bigint   ICELL;                                               //!< Cell index
   int      NCHILD;                                              //!< Number of child cells
-  int      NCBODY;                                              //!< Number of child leafs
-  int      NDBODY;                                              //!< Number of descendant leafs
+  int      NCLEAF;                                              //!< Number of child leafs
+  int      NDLEAF;                                              //!< Number of descendant leafs
   int      PARENT;                                              //!< Iterator offset of parent cell
   int      CHILD;                                               //!< Iterator offset of child cells
-  B_iter   BODY;                                                //!< Iterator of first leaf
-  vec3     X;                                                   //!< Cell center
+  B_iter   LEAF;                                                //!< Iterator of first leaf
+  vect     X;                                                   //!< Cell center
   real     R;                                                   //!< Cell radius
   real     RMAX;                                                //!< Max cell radius
   real     RCRIT;                                               //!< Critical cell radius
 
 //! Structure for Ewald summation
 struct Ewald {
-  vec3 K;                                                       //!< 3-D wave number vector
+  vect K;                                                       //!< 3-D wave number vector
   real REAL;                                                    //!< real part of wave
   real IMAG;                                                    //!< imaginary part of wave
 };

File include/vec.h

   }
   operator       T* ()       {return a;}                           // Type-casting (lvalue)
   operator const T* () const {return a;}                           // Type-casting (rvalue)
-  friend std::ostream &operator<<(std::ostream &s, const vec &b) { // Component-wise output stream
-    for_i s<<b[i]<<' ';
+  friend std::ostream &operator<<(std::ostream &s, const vec &a) { // Component-wise output stream
+    for_i s<<a[i]<<' ';
     return s;
   }
   friend T norm(const vec &b) {                                    // L2 norm squared
     for_i c+=b[i]*b[i];
     return c;
   }
-  friend vec min(const vec &b, const vec &c) {                     // Element-wise minimum
-    vec d;
-    for (int i=0; i<N; i++) d[i] = b[i] < c[i] ? b[i] : c[i];
-    return d;
-  }
-  friend vec max(const vec &b, const vec &c) {                     // Element-wise maximum
-    vec d;
-    for (int i=0; i<N; i++) d[i] = b[i] > c[i] ? b[i] : c[i];
-    return d;
-  }
 };
 
 #undef for_i

File include/vtk.h

   vtkPoints *points[maxGroups];
   vtkPoints *hexPoints;
 public:
-  void setDomain(const real r0, const vec3 x0) {
+  void setDomain(const real r0, const vect x0) {
     hexPoints = vtkPoints::New();
     hexPoints->SetNumberOfPoints(8);
     hexPoints->SetPoint(0, x0[0]-r0, x0[1]-r0, x0[2]-r0);
     points[Igroup]->SetNumberOfPoints(Npoints);
   }
 
-  void setPoints(const int Igroup, const vec3 X) {
+  void setPoints(const int Igroup, const vect X) {
     points[Igroup]->SetPoint(I[Igroup],X[0],X[1],X[2]);
     I[Igroup]++;
   }

File kernel/CPUCartesianLaplace.cxx

 #include "kernel.h"
 #undef KERNEL
 
+#ifndef __GXX_EXPERIMENTAL_CXX0X__
+#define constexpr const
+#endif
+
 template<int nx, int ny, int nz>
 struct Index {
-  static const int                I = Index<nx,ny+1,nz-1>::I + 1;
-  static const unsigned long long F = Index<nx,ny,nz-1>::F * nz;
+  static const int      I = Index<nx,ny+1,nz-1>::I + 1;
+  static constexpr real F = Index<nx,ny,nz-1>::F * nz;
 };
 
 template<int nx, int ny>
 struct Index<nx,ny,0> {
-  static const int                I = Index<nx+1,0,ny-1>::I + 1;
-  static const unsigned long long F = Index<nx,ny-1,0>::F * ny;
+  static const int      I = Index<nx+1,0,ny-1>::I + 1;
+  static constexpr real F = Index<nx,ny-1,0>::F * ny;
 };
 
 template<int nx>
 struct Index<nx,0,0> {
-  static const int                I = Index<0,0,nx-1>::I + 1;
-  static const unsigned long long F = Index<nx-1,0,0>::F * nx;
+  static const int      I = Index<0,0,nx-1>::I + 1;
+  static constexpr real F = Index<nx-1,0,0>::F * nx;
 };
 
 template<>
 struct Index<0,0,0> {
-  static const int                I = 0;
-  static const unsigned long long F = 1;
+  static const int      I = 0;
+  static constexpr real F = 1;
 };
 
 
 template<int n, int kx, int ky , int kz, int d>
 struct DerivativeTerm {
   static const int coef = 1 - 2 * n;
-  static inline real kernel(const Lset &C, const vec3 &dist) {
+  static inline real kernel(const Lset &C, const vect &dist) {
     return coef * dist[d] * C[Index<kx,ky,kz>::I];
   }
 };
 template<int n, int kx, int ky , int kz>
 struct DerivativeTerm<n,kx,ky,kz,-1> {
   static const int coef = 1 - n;
-  static inline real kernel(const Lset &C, const vec3&) {
+  static inline real kernel(const Lset &C, const vect&) {
     return coef * C[Index<kx,ky,kz>::I];
   }
 };
   static const int nextflag = 5 - (kz < nz || kz == 1);
   static const int dim = kz == (nz-1) ? -1 : 2;
   static const int n = nx + ny + nz;
-  static inline real loop(const Lset &C, const vec3 &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,kz-1,nextflag>::loop(C,dist)
          + DerivativeTerm<n,nx,ny,kz-1,dim>::kernel(C,dist);
   }
 template<int nx, int ny, int nz, int kx, int ky, int kz>
 struct DerivativeSum<nx,ny,nz,kx,ky,kz,4> {
   static const int nextflag = 3 - (ny == 0);
-  static inline real loop(const Lset &C, const vec3 &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,nz,nextflag>::loop(C,dist);
   }
 };
   static const int nextflag = 3 - (ky < ny || ky == 1);
   static const int dim = ky == (ny-1) ? -1 : 1;
   static const int n = nx + ny + nz;
-  static inline real loop(const Lset &C, const vec3 &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,nx,ky-1,nz,nextflag>::loop(C,dist)
          + DerivativeTerm<n,nx,ky-1,nz,dim>::kernel(C,dist);
   }
 template<int nx, int ny, int nz, int kx, int ky, int kz>
 struct DerivativeSum<nx,ny,nz,kx,ky,kz,2> {
   static const int nextflag = 1 - (nx == 0);
-  static inline real loop(const Lset &C, const vec3 &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,nz,nextflag>::loop(C,dist);
   }
 };
   static const int nextflag = 1 - (kx < nx || kx == 1);
   static const int dim = kx == (nx-1) ? -1 : 0;
   static const int n = nx + ny + nz;
-  static inline real loop(const Lset &C, const vec3 &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,kx-1,ny,nz,nextflag>::loop(C,dist)
          + DerivativeTerm<n,kx-1,ny,nz,dim>::kernel(C,dist);
   }
 
 template<int nx, int ny, int nz, int kx, int ky, int kz>
 struct DerivativeSum<nx,ny,nz,kx,ky,kz,0> {
-  static inline real loop(const Lset&, const vec3&) {
+  static inline real loop(const Lset&, const vect&) {
     return 0;
   }
 };
 
 template<int nx, int ny, int nz, int kx, int ky>
 struct DerivativeSum<nx,ny,nz,kx,ky,0,5> {
-  static inline real loop(const Lset &C, const vec3 &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,0,4>::loop(C,dist);
   }
 };
 
 template<int nx, int ny, int nz>
 struct Terms {
-  static inline void power(Lset &C, const vec3 &dist) {
+  static inline void power(Lset &C, const vect &dist) {
     Terms<nx,ny+1,nz-1>::power(C,dist);
     C[Index<nx,ny,nz>::I] = C[Index<nx,ny,nz-1>::I] * dist[2] / nz;
   }
-  static inline void derivative(Lset &C, const vec3 &dist, const real &invR2) {
+  static inline void derivative(Lset &C, const vect &dist, const real &invR2) {
     static const int n = nx + ny + nz;
     Terms<nx,ny+1,nz-1>::derivative(C,dist,invR2);
     C[Index<nx,ny,nz>::I] = DerivativeSum<nx,ny,nz>::loop(C,dist) / n * invR2;
 
 template<int nx, int ny>
 struct Terms<nx,ny,0> {
-  static inline void power(Lset &C, const vec3 &dist) {
+  static inline void power(Lset &C, const vect &dist) {
     Terms<nx+1,0,ny-1>::power(C,dist);
     C[Index<nx,ny,0>::I] = C[Index<nx,ny-1,0>::I] * dist[1] / ny;
   }
-  static inline void derivative(Lset &C, const vec3 &dist, const real &invR2) {
+  static inline void derivative(Lset &C, const vect &dist, const real &invR2) {
     static const int n = nx + ny;
     Terms<nx+1,0,ny-1>::derivative(C,dist,invR2);
     C[Index<nx,ny,0>::I] = DerivativeSum<nx,ny,0>::loop(C,dist) / n * invR2;
 
 template<int nx>
 struct Terms<nx,0,0> {
-  static inline void power(Lset &C, const vec3 &dist) {
+  static inline void power(Lset &C, const vect &dist) {
     Terms<0,0,nx-1>::power(C,dist);
     C[Index<nx,0,0>::I] = C[Index<nx-1,0,0>::I] * dist[0] / nx;
   }
-  static inline void derivative(Lset &C, const vec3 &dist, const real &invR2) {
+  static inline void derivative(Lset &C, const vect &dist, const real &invR2) {
     static const int n = nx;
     Terms<0,0,nx-1>::derivative(C,dist,invR2);
     C[Index<nx,0,0>::I] = DerivativeSum<nx,0,0>::loop(C,dist) / n * invR2;
 
 template<>
 struct Terms<0,0,0> {
-  static inline void power(Lset&, const vec3&) {}
-  static inline void derivative(Lset&, const vec3&, const real&) {}
+  static inline void power(Lset&, const vect&) {}
+  static inline void derivative(Lset&, const vect&, const real&) {}
   static inline void scale(Lset&) {}
 };
 
 };
 
 template<int PP>
-inline void getCoef(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
+inline void getCoef(Lset &C, const vect &dist, real &invR2, const real &invR) {
   C[0] = invR;
   Terms<0,0,PP>::derivative(C,dist,invR2);
   Terms<0,0,PP>::scale(C);
 }
 
 template<>
-inline void getCoef<1>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
+inline void getCoef<1>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   C[0] = invR;
   invR2 = -invR2;
   real x = dist[0], y = dist[1], z = dist[2];
 }
 
 template<>
-inline void getCoef<2>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
+inline void getCoef<2>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   getCoef<1>(C,dist,invR2,invR);
   real x = dist[0], y = dist[1], z = dist[2];
   real invR3 = invR * invR2;
 }
 
 template<>
-inline void getCoef<3>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
+inline void getCoef<3>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   getCoef<2>(C,dist,invR2,invR);
   real x = dist[0], y = dist[1], z = dist[2];
   real invR3 = invR * invR2;
 }
 
 template<>
-inline void getCoef<4>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
+inline void getCoef<4>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   getCoef<3>(C,dist,invR2,invR);
   real x = dist[0], y = dist[1], z = dist[2];
   real invR3 = invR * invR2;
 }
 
 template<>
-inline void getCoef<5>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
+inline void getCoef<5>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   getCoef<4>(C,dist,invR2,invR);
   real x = dist[0], y = dist[1], z = dist[2];
   real invR3 = invR * invR2;
 }
 
 template<>
-inline void getCoef<6>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
+inline void getCoef<6>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   getCoef<5>(C,dist,invR2,invR);
   real x = dist[0], y = dist[1], z = dist[2];
   real invR3 = invR * invR2;
 void Kernel<Laplace>::P2M(C_iter Ci) {
   real Rmax = 0;
 //  setCenter(Ci);
-  for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NCBODY; ++B ) {
-    vec3 dist = Ci->X - B->X;
+  for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NCLEAF; ++B ) {
+    vect dist = Ci->X - B->X;
     real R = std::sqrt(norm(dist));
     if( R > Rmax ) Rmax = R;
     Lset M;
   real Rmax = Ci->RMAX;
 //  setCenter(Ci);
   for( C_iter Cj=Cj0+Ci->CHILD; Cj!=Cj0+Ci->CHILD+Ci->NCHILD; ++Cj ) {
-    vec3 dist = Ci->X - Cj->X;
+    vect dist = Ci->X - Cj->X;
     real R = std::sqrt(norm(dist)) + Cj->RCRIT;
     if( R > Rmax ) Rmax = R;
     Mset M;
 
 template<>
 void Kernel<Laplace>::M2L(C_iter Ci, C_iter Cj) const {
-  vec3 dist = Ci->X - Cj->X - Xperiodic;
+  vect dist = Ci->X - Cj->X - Xperiodic;
   real invR2 = 1 / norm(dist);
-  real invR  = Ci->M[0] * Cj->M[0] * std::sqrt(invR2);
+  real invR  = Cj->M[0] * std::sqrt(invR2);
   Lset C;
   getCoef<P>(C,dist,invR2,invR);
   sumM2L<P>(Ci->L,C,Cj->M);
 
 template<>
 void Kernel<Laplace>::M2P(C_iter Ci, C_iter Cj) const {
-  for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NDBODY; ++B ) {
-    vec3 dist = B->X - Cj->X - Xperiodic;
+  for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {
+    vect dist = B->X - Cj->X - Xperiodic;
     real invR2 = 1 / norm(dist);
     real invR  = Cj->M[0] * std::sqrt(invR2);
     Lset C;
 template<>
 void Kernel<Laplace>::L2L(C_iter Ci) const {
   C_iter Cj = Ci0 + Ci->PARENT;
-  vec3 dist = Ci->X - Cj->X;
+  vect dist = Ci->X - Cj->X;
   Lset C;
   C[0] = 1;
   Terms<0,0,P>::power(C,dist);
-  Ci->L /= Ci->M[0];
+
   Ci->L += Cj->L;
   for( int i=1; i<LTERM; ++i ) Ci->L[0] += C[i] * Cj->L[i];
   Downward<0,0,P-1>::L2L(Ci->L,C,Cj->L);
 
 template<>
 void Kernel<Laplace>::L2P(C_iter Ci) const {
-  for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NCBODY; ++B ) {
-    vec3 dist = B->X - Ci->X;
+  for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NCLEAF; ++B ) {
+    vect dist = B->X - Ci->X;
     Lset C, L;
     C[0] = 1;
     Terms<0,0,P>::power(C,dist);
 
     L = Ci->L;
-    B->TRG /= B->SRC;
     B->TRG[0] += L[0];
     B->TRG[1] += L[1];
     B->TRG[2] += L[2];

File kernel/CPUEvaluator.cxx

   Xperiodic = 0;                                                // Set periodic coordinate offset
   Cells cells;                                                  // Cells to put target and source bodies
   cells.resize(2);                                              // Resize cells to put target and source bodies
-  cells[0].BODY = ibodies.begin();                              // Iterator of first target leaf
-  cells[0].NDBODY = ibodies.size();                             // Number of target leafs
-  cells[1].BODY = jbodies.begin();                              // Iterator of first source leaf
-  cells[1].NDBODY = jbodies.size();                             // Number of source leafs
+  cells[0].LEAF = ibodies.begin();                              // Iterator of first target leaf
+  cells[0].NDLEAF = ibodies.size();                             // Number of target leafs
+  cells[1].LEAF = jbodies.begin();                              // Iterator of first source leaf
+  cells[1].NDLEAF = jbodies.size();                             // Number of source leafs
   C_iter Ci = cells.begin(), Cj = cells.begin()+1;              // Iterator of target and source cells
   P2P(Ci,Cj);                                                   // Perform P2P kernel
 }
               Xperiodic[0] = ix * 2 * R0;                       //       Coordinate offset for x periodic direction
               Xperiodic[1] = iy * 2 * R0;                       //       Coordinate offset for y periodic direction
               Xperiodic[2] = iz * 2 * R0;                       //       Coordinate offset for z periodic direction
-              evalP2P(Ci,Cj);                                   //       Perform P2P kernel
+              P2P(Ci,Cj);                                       //       Perform P2P kernel
             }                                                   //      Endif for periodic flag
           }                                                     //     End loop over x periodic direction
         }                                                       //    End loop over y periodic direction
   cells.resize(2);                                              // Two artificial cells
   C_iter Ci = cells.begin(), Cj = cells.begin()+1;              // Artificial target & source cell
   Ci->X = 0;                                                    // Set coordinates of target cell
-  Ci->NDBODY = 10;                                              // Number of leafs in target cell
-  Ci->BODY = ibodies.begin();                                   // Leaf iterator in target cell
+  Ci->NDLEAF = 10;                                              // Number of leafs in target cell
+  Ci->LEAF = ibodies.begin();                                   // Leaf iterator in target cell
   Cj->X = 1;                                                    // Set coordinates of source cell
-  Cj->NDBODY = 1000;                                            // Number of leafs in source cell
-  Cj->BODY = jbodies.begin();                                   // Leaf iterator in source cell
+  Cj->NDLEAF = 1000;                                            // Number of leafs in source cell
+  Cj->LEAF = jbodies.begin();                                   // Leaf iterator in source cell
   startTimer("P2P kernel");                                     // Start timer
   for( int i=0; i!=1; ++i ) P2P(Ci,Cj);                         // Perform P2P kernel
   timeP2P = stopTimer("P2P kernel") / 10000;                    // Stop timer

File kernel/CPUEwaldLaplace.cxx

 
 template<>
 void Kernel<Laplace>::EwaldReal(C_iter Ci, C_iter Cj) const {   // Ewald real part on CPU
-  for( B_iter Bi=Ci->BODY; Bi!=Ci->BODY+Ci->NDBODY; ++Bi ) {    // Loop over target bodies
-    for( B_iter Bj=Cj->BODY; Bj!=Cj->BODY+Cj->NDBODY; ++Bj ) {  //  Loop over source bodies
-      vec3 dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
+  for( B_iter Bi=Ci->LEAF; Bi!=Ci->LEAF+Ci->NDLEAF; ++Bi ) {    // Loop over target bodies
+    for( B_iter Bj=Cj->LEAF; Bj!=Cj->LEAF+Cj->NDLEAF; ++Bj ) {  //  Loop over source bodies
+      vect dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
       for( int d=0; d<3; d++ ) {                                //   Loop over dimensions
         if( dist[d] < -R0 ) dist[d] += 2 * R0;                  //    Wrap domain so that target is always at
         if( dist[d] >= R0 ) dist[d] -= 2 * R0;                  //    the center of a [-R0,R0]^3 source cube
     for( int d=0; d<3; d++ ) B->TRG[d+1] *= scale;
   }
 
-  vec3 dipole = 0;
+  vect dipole = 0;
   for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {
     dipole += (B->X - R0) * B->SRC;
   }

File kernel/CPUP2P.cxx

 #undef KERNEL
 
 template<>
-void Kernel<Laplace>::P2P(C_iter C) const {
-  B_iter B = C->BODY;
-  int n = C->NDBODY;
-  int i = 0;
-  for ( ; i<n; i++) {
-    real pot = 0;
-    vec3 acc = 0;
-    for (int j=i+1; j<n; j++) {
-      vec3 dX = B[i].X - B[j].X;
-      real R2 = norm(dX) + EPS2;
-      if (R2 != 0) {
-        real invR2 = 1.0 / R2;
-        real invR = B[i].SRC * B[j].SRC * sqrtf(invR2);
-        dX *= invR2 * invR;
-        pot += invR;
-        acc += dX;
-        B[j].TRG[0] += invR;
-        B[j].TRG[1] += dX[0];
-        B[j].TRG[2] += dX[1];
-        B[j].TRG[3] += dX[2];
-      }
-    }
-    B[i].TRG[0] += pot;
-    B[i].TRG[1] -= acc[0];
-    B[i].TRG[2] -= acc[1];
-    B[i].TRG[3] -= acc[2];
-  }
-}
-
-template<>
 void Kernel<Laplace>::P2P(C_iter Ci, C_iter Cj) const {         // Laplace P2P kernel on CPU
-#if 1
-  B_iter Bi = Ci->BODY;
-  B_iter Bj = Cj->BODY;
-  int ni = Ci->NDBODY;
-  int nj = Cj->NDBODY;
-  int i = 0;
-  for ( ; i<ni; i++) {
-    real pot = 0;
-    vec3 acc = 0;
-    for (int j=0; j<nj; j++) {
-      vec3 dX = Bi[i].X - Bj[j].X - Xperiodic;
-      real R2 = norm(dX) + EPS2;
-      if (R2 != 0) {
-        real invR2 = 1.0f / R2;
-        real invR = Bi[i].SRC * Bj[j].SRC * sqrtf(invR2);
-        dX *= invR2 * invR;
-        pot += invR;
-        acc += dX;
-      }
-    }
-    Bi[i].TRG[0] += pot;
-    Bi[i].TRG[1] -= acc[0];
-    Bi[i].TRG[2] -= acc[1];
-    Bi[i].TRG[3] -= acc[2];
-  }
-#else
-  for( B_iter Bi=Ci->BODY; Bi!=Ci->BODY+Ci->NDBODY; ++Bi ) {    // Loop over target bodies
+#ifndef SPARC_SIMD
+  for( B_iter Bi=Ci->LEAF; Bi!=Ci->LEAF+Ci->NDLEAF; ++Bi ) {    // Loop over target bodies
     real P0 = 0;                                                //  Initialize potential
-    vec3 F0 = 0;                                                //  Initialize force
-    for( B_iter Bj=Cj->BODY; Bj!=Cj->BODY+Cj->NDBODY; ++Bj ) {  //  Loop over source bodies
-      vec3 dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
+    vect F0 = 0;                                                //  Initialize force
+    for( B_iter Bj=Cj->LEAF; Bj!=Cj->LEAF+Cj->NDLEAF; ++Bj ) {  //  Loop over source bodies
+      vect dist = Bi->X - Bj->X -Xperiodic;                     //   Distance vector from source to target
       real R2 = norm(dist) + EPS2;                              //   R^2
       real invR2 = 1.0 / R2;                                    //   1 / R^2
       if( R2 == 0 ) invR2 = 0;                                  //   Exclude self interaction
-      real invR = Bi->SRC * Bj->SRC * std::sqrt(invR2);         //   potential
+      real invR = Bj->SRC * std::sqrt(invR2);                   //   potential
       dist *= invR2 * invR;                                     //   force
       P0 += invR;                                               //   accumulate potential
       F0 += dist;                                               //   accumulate force
     Bi->TRG[2] -= F0[1];                                        //  y component of force
     Bi->TRG[3] -= F0[2];                                        //  z component of force
   }                                                             // End loop over target bodies
+#else
+  real (* cbi)[10] =  (real (*)[10])(&(Ci->LEAF->IBODY));
+  real (* cbj)[10] =  (real (*)[10])(&(Cj->LEAF->IBODY));
+  real xp[3] = {Xperiodic[0],Xperiodic[1],Xperiodic[2]};
+  int ni = Ci->NDLEAF;
+  int nj = Cj->NDLEAF;
+  int i,j;
+#pragma loop norecurrence
+  for(i=0;i<ni;i++){
+    for(j=0;j<nj;j++){
+      real dist_x = cbi[i][2] - cbj[j][2] - xp[0];
+      real dist_y = cbi[i][3] - cbj[j][3] - xp[1];
+      real dist_z = cbi[i][4] - cbj[j][4] - xp[2];
+      real R2 = EPS2+dist_x*dist_x+dist_y*dist_y+dist_z*dist_z;
+      real invR = 1.0/sqrt(R2);
+      if( R2 == 0 ) invR = 0;
+      real invR3 = cbj[j][5] * invR * invR * invR;
+      cbi[i][6] += cbj[j][5] * invR;
+      cbi[i][7] -= dist_x * invR3;
+      cbi[i][8] -= dist_y * invR3;
+      cbi[i][9] -= dist_z * invR3;
+    }
+  }
 #endif
 }
 
 template<>
 void Kernel<VanDerWaals>::P2P(C_iter Ci, C_iter Cj) const {     // Van der Waals P2P kernel on CPU
-  for( B_iter Bi=Ci->BODY; Bi!=Ci->BODY+Ci->NDBODY; ++Bi ) {    // Loop over target bodies
+  for( B_iter Bi=Ci->LEAF; Bi!=Ci->LEAF+Ci->NDLEAF; ++Bi ) {    // Loop over target bodies
     int atypei = int(Bi->SRC);                                  //  Atom type of target
-    for( B_iter Bj=Cj->BODY; Bj!=Cj->BODY+Cj->NDBODY; ++Bj ) {  //  Loop over source bodies
+    for( B_iter Bj=Cj->LEAF; Bj!=Cj->LEAF+Cj->NDLEAF; ++Bj ) {  //  Loop over source bodies
       int atypej = int(Bj->SRC);                                //   Atom type of source
-      vec3 dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
+      vect dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
       real R2 = norm(dist);                                     //   R squared
       if( R2 != 0 ) {                                           //   Exclude self interaction
         real rs = RSCALE[atypei*ATOMS+atypej];                  //    r scale

File kernel/CPUSphericalLaplace.cxx

 
 namespace{
 //! Get r,theta,phi from x,y,z
-void cart2sph(real& r, real& theta, real& phi, vec3 dist) {
+void cart2sph(real& r, real& theta, real& phi, vect dist) {
   r = sqrt(norm(dist)) * (1 + EPS);                           // r = sqrt(x^2 + y^2 + z^2)
   if( r < EPS ) {                                             // If r == 0
     theta = 0;                                                //  theta can be anything so we set it to 0
 void Kernel<Laplace>::P2M(C_iter Cj) {
   real Rmax = 0;
   complex Ynm[4*P*P], YnmTheta[4*P*P];
-  for( B_iter B=Cj->BODY; B!=Cj->BODY+Cj->NCBODY; ++B ) {
-    vec3 dist = B->X - Cj->X;
+  for( B_iter B=Cj->LEAF; B!=Cj->LEAF+Cj->NCLEAF; ++B ) {
+    vect dist = B->X - Cj->X;
     real R = std::sqrt(norm(dist));
     if( R > Rmax ) Rmax = R;
     real rho, alpha, beta;
   complex Ynm[4*P*P], YnmTheta[4*P*P];
   real Rmax = Ci->RMAX;
   for( C_iter Cj=Cj0+Ci->CHILD; Cj!=Cj0+Ci->CHILD+Ci->NCHILD; ++Cj ) {
-    vec3 dist = Ci->X - Cj->X;
+    vect dist = Ci->X - Cj->X;
     real R = std::sqrt(norm(dist)) + Cj->RCRIT;
     if( R > Rmax ) Rmax = R;
     real rho, alpha, beta;
 template<>
 void Kernel<Laplace>::M2L(C_iter Ci, C_iter Cj) const {
   complex Ynm[4*P*P], YnmTheta[4*P*P];
-  vec3 dist = Ci->X - Cj->X - Xperiodic;
+  vect dist = Ci->X - Cj->X - Xperiodic;
   real rho, alpha, beta;
   cart2sph(rho,alpha,beta,dist);
   evalLocal(rho,alpha,beta,Ynm,YnmTheta);
 void Kernel<Laplace>::M2P(C_iter Ci, C_iter Cj) const {
   const complex I(0.,1.);                                       // Imaginary unit
   complex Ynm[4*P*P], YnmTheta[4*P*P];
-  for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NDBODY; ++B ) {
-    vec3 dist = B->X - Cj->X - Xperiodic;
-    vec3 spherical = 0;
-    vec3 cartesian = 0;
+  for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {
+    vect dist = B->X - Cj->X - Xperiodic;
+    vect spherical = 0;
+    vect cartesian = 0;
     real r, theta, phi;
     cart2sph(r,theta,phi,dist);
     evalLocal(r,theta,phi,Ynm,YnmTheta);
   const complex I(0.,1.);
   complex Ynm[4*P*P], YnmTheta[4*P*P];
   C_iter Cj = Ci0 + Ci->PARENT;
-  vec3 dist = Ci->X - Cj->X;
+  vect dist = Ci->X - Cj->X;
   real rho, alpha, beta;
   cart2sph(rho,alpha,beta,dist);
   evalMultipole(rho,alpha,beta,Ynm,YnmTheta);
 void Kernel<Laplace>::L2P(C_iter Ci) const {
   const complex I(0.,1.);                                       // Imaginary unit
   complex Ynm[4*P*P], YnmTheta[4*P*P];
-  for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NCBODY; ++B ) {
-    vec3 dist = B->X - Ci->X;
-    vec3 spherical = 0;
-    vec3 cartesian = 0;
+  for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NCLEAF; ++B ) {
+    vect dist = B->X - Ci->X;
+    vect spherical = 0;
+    vect cartesian = 0;
     real r, theta, phi;
     cart2sph(r,theta,phi,dist);
     evalMultipole(r,theta,phi,Ynm,YnmTheta);

File kernel/GPUEvaluator.cxx

   for( MC_iter M=sourceSize.begin(); M!=sourceSize.end(); ++M ) {// Loop over source map
     C_iter Cj = M->first;                                       //  Set source cell
     sourceBegin[Cj] = sourceHost.size() / 4;                    //  Key : iterator, Value : offset of source leafs
-    for( B_iter B=Cj->BODY; B!=Cj->BODY+Cj->NDBODY; ++B ) {     //  Loop over leafs in source cell
+    for( B_iter B=Cj->LEAF; B!=Cj->LEAF+Cj->NDLEAF; ++B ) {     //  Loop over leafs in source cell
       sourceHost.push_back(B->X[0]);                            //   Copy x position to GPU buffer
       sourceHost.push_back(B->X[1]);                            //   Copy y position to GPU buffer
       sourceHost.push_back(B->X[2]);                            //   Copy z position to GPU buffer
   int key = 0;                                                  // Initialize key to range of coefs in source cells
   for( C_iter Ci=CiB; Ci!=CiE; ++Ci ) {                         // Loop over target cells
     if( !lists[Ci-Ci0].empty() ) {                              //  If the interation list is not empty
-      int blocks = (Ci->NDBODY - 1) / THREADS + 1;              //   Number of thread blocks needed for this target cell
+      int blocks = (Ci->NDLEAF - 1) / THREADS + 1;              //   Number of thread blocks needed for this target cell
       for( int i=0; i!=blocks; ++i ) {                          //   Loop over thread blocks
         keysHost.push_back(key);                                //    Save key to range of leafs in source cells
       }                                                         //   End loop over thread blocks
         rangeHost.push_back(flags[Ci-Ci0][Cj]);                 //    Set periodic image flag of source cell
       }                                                         //   End loop over interaction list
       targetBegin[Ci] = targetHost.size() / 4;                  //   Key : iterator, Value : offset of target leafs
-      for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NDBODY; ++B ) {   //   Loop over leafs in target cell
+      for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {   //   Loop over leafs in target cell
         targetHost.push_back(B->X[0]);                          //    Copy x position to GPU buffer
         targetHost.push_back(B->X[1]);                          //    Copy y position to GPU buffer
         targetHost.push_back(B->X[2]);                          //    Copy z position to GPU buffer
         targetHost.push_back(B->SRC);                           //    Copy target value to GPU buffer
       }                                                         //   End loop over leafs
-      int numPad = blocks * THREADS - Ci->NDBODY;               //   Number of elements to pad in target GPU buffer
+      int numPad = blocks * THREADS - Ci->NDLEAF;               //   Number of elements to pad in target GPU buffer
       for( int i=0; i!=numPad; ++i ) {                          //   Loop over elements to pad
         targetHost.push_back(0);                                //    Pad x position in GPU buffer
         targetHost.push_back(0);                                //    Pad y position in GPU buffer
   for( C_iter Ci=CiB; Ci!=CiE; ++Ci ) {                         // Loop over target cells
     if( !lists[Ci-Ci0].empty() ) {                              //  If the interation list is not empty
       int begin = targetBegin[Ci];                              //   Offset of target leafs
-        for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NDBODY; ++B ) { //    Loop over target bodies
-          B->TRG[0] += targetHost[4*(begin+B-Ci->BODY)+0];      //     Copy 1st target value from GPU buffer
-          B->TRG[1] += targetHost[4*(begin+B-Ci->BODY)+1];      //     Copy 2nd target value from GPU buffer
-          B->TRG[2] += targetHost[4*(begin+B-Ci->BODY)+2];      //     Copy 3rd target value from GPU buffer
-          B->TRG[3] += targetHost[4*(begin+B-Ci->BODY)+3];      //     Copy 4th target value from GPU buffer
+        for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) { //    Loop over target bodies
+          B->TRG[0] += targetHost[4*(begin+B-Ci->LEAF)+0];      //     Copy 1st target value from GPU buffer
+          B->TRG[1] += targetHost[4*(begin+B-Ci->LEAF)+1];      //     Copy 2nd target value from GPU buffer
+          B->TRG[2] += targetHost[4*(begin+B-Ci->LEAF)+2];      //     Copy 3rd target value from GPU buffer
+          B->TRG[3] += targetHost[4*(begin+B-Ci->LEAF)+3];      //     Copy 4th target value from GPU buffer
         }                                                       //    End loop over target bodies
       lists[Ci-Ci0].clear();                                    //   Clear interaction list
     }                                                           //  End if for empty interation list
   for( int icall=0; icall!=numIcall; ++icall ) {                // Loop over icall
     B_iter Bi0 = ibodies.begin()+ioffset;                       //  Set target bodies begin iterator
     B_iter BiN = ibodies.begin()+std::min(ioffset+MAXBODY,int(ibodies.size()));// Set target bodies end iterator
-    cells[0].BODY = Bi0;                                        //  Iterator of first target leaf
-    cells[0].NDBODY = BiN-Bi0;                                  //  Number of target leafs
+    cells[0].LEAF = Bi0;                                        //  Iterator of first target leaf
+    cells[0].NDLEAF = BiN-Bi0;                                  //  Number of target leafs
     int joffset = 0;                                            //  Initialize offset for jcall loops
     for( int jcall=0; jcall!=numJcall; ++jcall ) {              //  Loop over jcall
       B_iter Bj0 = jbodies.begin()+joffset;                     //  Set source bodies begin iterator
       B_iter BjN = jbodies.begin()+std::min(joffset+MAXBODY,int(jbodies.size()));// Set source bodies end iterator
-      cells[1].BODY = Bj0;                                      //  Iterator of first source leaf
-      cells[1].NDBODY = BjN-Bj0;                                //  Number of source leafs
+      cells[1].LEAF = Bj0;                                      //  Iterator of first source leaf
+      cells[1].NDLEAF = BjN-Bj0;                                //  Number of source leafs
       C_iter Ci = cells.begin(), Cj = cells.begin()+1;          //  Iterator of target and source cells
       if( onCPU ) {                                             //  If calculation is to be done on CPU
         Xperiodic = 0;                                          //   Set periodic coordinate offset
       if( Ci->NCHILD == 0 ) {                                   //   If cell is a twig
         listP2M[Ci-Ci0].push_back(Ci);                          //    Push source cell into P2M interaction list
         flagP2M[Ci-Ci0][Ci] |= Icenter;                         //    Flip bit of periodic image flag
-        sourceSize[Ci] = Ci->NDBODY;                            //    Key : iterator, Value : number of leafs
+        sourceSize[Ci] = Ci->NDLEAF;                            //    Key : iterator, Value : number of leafs
       }                                                         //   End loop over cells topdown
     }                                                           //  End loop over source map
     stopTimer("Get list");                                      //  Stop timer
     for( C_iter Ci=CiB; Ci!=CiE; ++Ci ) {                       //  Loop over target cells
       for( LC_iter L=listP2P[Ci-Ci0].begin(); L!=listP2P[Ci-Ci0].end(); ++L ) {//  Loop over interaction list
         C_iter Cj = *L;                                         //    Set source cell
-        sourceSize[Cj] = Cj->NDBODY;                            //    Key : iterator, Value : number of leafs
+        sourceSize[Cj] = Cj->NDLEAF;                            //    Key : iterator, Value : number of leafs
       }                                                         //   End loop over interaction list
     }                                                           //  End loop over target cells
     stopTimer("Get list");                                      //  Stop timer
     for( C_iter Ci=CiB; Ci!=CiE; ++Ci ) {                       //  Loop over target cells
       for( LC_iter L=listP2P[Ci-Ci0].begin(); L!=listP2P[Ci-Ci0].end(); ++L ) {//  Loop over interaction list
         C_iter Cj = *L;                                         //    Set source cell
-        sourceSize[Cj] = Cj->NDBODY;                            //    Key : iterator, Value : number of leafs
+        sourceSize[Cj] = Cj->NDLEAF;                            //    Key : iterator, Value : number of leafs
       }                                                         //   End loop over interaction list
     }                                                           //  End loop over target cells
     stopTimer("Get list");                                      //  Stop timer
   Ci0 = icells.begin();                                         // Set global begin iterator for s