Commits

Rio Yokota committed 6536d62

Comparable switch for exafmm-dev.

  • Participants
  • Parent commits 9d09076

Comments (0)

Files changed (19)

include/bottomup.h

 */
 #ifndef bottomup_h
 #define bottomup_h
+#if COMPARE
 #include "topdown.h"
+#else
+#include "topdown2.h"
+#endif
 
 //! Bottomup tree constructor
 template<Equation equation>

include/evaluator.h

       C = cellStack.top();                                      //  Get cell from top of stack
       cellStack.pop();                                          //  Pop traversal stack
       for( C_iter Cj=Cj0+C->CHILD; Cj!=Cj0+C->CHILD+C->NCHILD; ++Cj ) {// Loop over cell's children
-        vect dX = Ci->X - Cj->X - Xperiodic;                    //   Distance vector from source to target
+        vec3 dX = Ci->X - Cj->X - Xperiodic;                    //   Distance vector from source to target
         real Rq = std::sqrt(norm(dX));                          //   Scalar distance
         if( Rq * THETA < Ci->R + Cj->R && Cj->NCHILD == 0 ) {   //   If twigs are close
           evalEwaldReal(Ci,Cj);                                 //    Ewald real part
 protected:
 //! Dual tree traversal for a single pair of cells
   void traverse(C_iter Ci, C_iter Cj) {
-    vect dX = Ci->X - Cj->X - Xperiodic;                        // Distance vector from source to target
+    vec3 dX = Ci->X - Cj->X - Xperiodic;                        // Distance vector from source to target
     real R2 = norm(dX);                                         // Scalar distance squared
-#if DUAL
-    {                                                           // Dummy bracket
-#else
-    if (Ci->RCRIT != Cj->RCRIT) {                               // If cell is not at the same level
-      splitCell(Ci, Cj);                                        //  Split cell and call function recursively for child
-    } else {                                                    // If we don't care if cell is not at the same level
-#endif
-//      if (R2 > (Ci->RCRIT+Cj->RCRIT)*(Ci->RCRIT+Cj->RCRIT)) {   //  If distance is far enough
-      if (R2 * THETA * THETA > (Ci->R+Cj->R)*(Ci->R+Cj->R)) {   //  If distance is far enough
-        approximate(Ci, Cj);                                    //   Use approximate kernels
-      } else if (Ci->NCHILD == 0 && Cj->NCHILD == 0) {          //  Else if both cells are bodies
-        if (Cj->NCBODY == 0) {                                  //   If the bodies weren't sent from remote node
-          approximate(Ci, Cj);                                  //    Use approximate kernels
-        } else {                                                //   Else if the bodies were sent
-          if (Ci == Cj) {                                       //    If source and target are same
-            P2P(Ci);                                            //     P2P kernel for single cell
-          } else {                                              //    Else if source and target are different
-            P2P(Ci, Cj);                                        //     P2P kernel for pair of cells
-          }                                                     //    End if for same source and target
-          NP2P++;                                               //    Increment P2P counter
-        }                                                       //   End if for bodies
-      } else {                                                  //  Else if cells are close but not bodies
-        splitCell(Ci, Cj);                                      //   Split cell and call function recursively for child
-      }                                                         //  End if for multipole acceptance
-    }                                                           // End if for same level cells
+    if (R2 > (Ci->RCRIT+Cj->RCRIT)*(Ci->RCRIT+Cj->RCRIT)) {   //  If distance is far enough
+      approximate(Ci, Cj);                                      //   Use approximate kernels
+    } else if (Ci->NCHILD == 0 && Cj->NCHILD == 0) {            //  Else if both cells are bodies
+      if (Cj->NCBODY == 0) {                                    //   If the bodies weren't sent from remote node
+        approximate(Ci, Cj);                                    //    Use approximate kernels
+      } else {                                                  //   Else if the bodies were sent
+        if (Ci == Cj) {                                         //    If source and target are same
+          P2P(Ci);                                              //     P2P kernel for single cell
+        } else {                                                //    Else if source and target are different
+          P2P(Ci, Cj);                                          //     P2P kernel for pair of cells
+        }                                                       //    End if for same source and target
+        NP2P++;                                                 //    Increment P2P counter
+      }                                                         //   End if for bodies
+    } else {                                                    //  Else if cells are close but not bodies
+      splitCell(Ci, Cj);                                        //   Split cell and call function recursively for child
+    }                                                           //  End if for multipole acceptance
   }
 
 //! Get range of periodic images
 
 //! Use multipole acceptance criteria to determine whether to approximate, do P2P, or subdivide
   void interact(C_iter Ci, C_iter Cj, PairQueue &pairQueue) {
-    vect dX = Ci->X - Cj->X - Xperiodic;                        // Distance vector from source to target
+    vec3 dX = Ci->X - Cj->X - Xperiodic;                        // Distance vector from source to target
     real Rq = std::sqrt(norm(dX));                              // Scalar distance
     if( Rq * THETA > Ci->R + Cj->R ) {                          // If distance if far enough
       approximate(Ci,Cj);                                       //  Use approximate kernels, e.g. M2L, M2P
 
 //! Dual tree traversal
   void traverse(Cells &cells, Cells &jcells) {
+#if COMPARE
+    C_iter root = cells.begin();                                // Iterator for root target cell
+    C_iter jroot = jcells.begin();                              // Iterator for root source cell
+#else
     C_iter root = cells.end() - 1;                              // Iterator for root target cell
     C_iter jroot = jcells.end() - 1;                            // Iterator for root source cell
+#endif
     if( IMAGES != 0 ) {                                         // If periodic boundary condition
       jroot = jcells.end() - 1 - 26 * 27 * (IMAGES - 1);        //  The root is not at the end
     }                                                           // Endif for periodic boundary condition
   real *Anm;                                                    //!< \f$ (-1)^n / \sqrt{ \frac{(n + m)!}{(n - m)!} } \f$
   complex *Cnm;                                                 //!< M2L translation matrix \f$ C_{jn}^{km} \f$
 public:
-  vect X0;                                                      //!< Center of root cell
+  vec3 X0;                                                      //!< Center of root cell
   real R0;                                                      //!< Radius of root cell
 
 private:
-  real getBmax(vect const&X, C_iter C) const {
+  real getBmax(vec3 const&X, C_iter C) const {
     real rad = C->R;
     real dx = rad+std::abs(X[0]-C->X[0]);
     real dy = rad+std::abs(X[1]-C->X[1]);
 protected:
   void setCenter(C_iter C) const {
     real m = 0;
-    vect X = 0;
+    vec3 X = 0;
     for( B_iter B=C->BODY; B!=C->BODY+C->NCBODY; ++B ) {
       m += std::abs(B->SRC);
       X += B->X * std::abs(B->SRC);
   KernelBase &operator=(const KernelBase) {return *this;}
 
 //! Set center of root cell
-  void setX0(vect x0) {X0 = x0;}
+  void setX0(vec3 x0) {X0 = x0;}
 //! Set radius of root cell
   void setR0(real r0) {R0 = r0;}
 
 //! Get center of root cell
-  vect getX0() const {return X0;}
+  vec3 getX0() const {return X0;}
 //! Get radius of root cell
   real getR0() const {return R0;}
 
 //! Set center and size of root cell
-  void setDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
-    vect xmin,xmax;                                             // Min,Max of domain
+  void setDomain(Bodies &bodies, vec3 x0=0, real r0=M_PI) {
+    vec3 xmin,xmax;                                             // Min,Max of domain
     B_iter B = bodies.begin();                                  // Reset body iterator
     xmin = xmax = B->X;                                         // Initialize xmin,xmax
     for( B=bodies.begin(); B!=bodies.end(); ++B ) {             // Loop over bodies
         else if(B->X[d] > xmax[d]) xmax[d] = B->X[d];           //   Determine xmax
       }                                                         //  End loop over each dimension
     }                                                           // End loop over bodies
+    for (int d=0; d<3; d++) x0[d] = (xmax[d] + xmin[d]) / 2;    // Calculate center of domain
+    r0 = 0;                                                     // Initialize localRadius
+    for (int d=0; d<3; d++) {                                   // Loop over dimensions
+      r0 = std::max(x0[d] - xmin[d], r0);                       // Calculate min distance from center
+      r0 = std::max(xmax[d] - x0[d], r0);                       // Calculate max distance from center 
+    }                                                           // End loop over dimensions
+    r0 *= 1.00001;                                              // Add some leeway to radius
 /*
     if( xmin[0] < x0[0]-r0 || x0[0]+r0 < xmax[0]                //  Check for outliers in x direction
      || xmin[1] < x0[1]-r0 || x0[1]+r0 < xmax[1]                //  Check for outliers in y direction

include/parallelfmm.h

   std::vector<int>    sendCellDsp;                              //!< Vector of cell send displacements
   std::vector<int>    recvCellCnt;                              //!< Vector of cell recv counts
   std::vector<int>    recvCellDsp;                              //!< Vector of cell recv displacements
-  std::vector<vect>   xminAll;                                  //!< Buffer for gathering XMIN
-  std::vector<vect>   xmaxAll;                                  //!< Buffer for gathering XMAX
+  std::vector<vec3>   xminAll;                                  //!< Buffer for gathering XMIN
+  std::vector<vec3>   xmaxAll;                                  //!< Buffer for gathering XMAX
 
   JBodies sendBodies;                                           //!< Send buffer for bodies
   JBodies recvBodies;                                           //!< Recv buffer for bodies
   }
 
 //! Get boundries of domains on other processes
-  void getOtherDomain(vect &xmin, vect &xmax, int l) {
+  void getOtherDomain(vec3 &xmin, vec3 &xmax, int l) {
     startTimer("Get domain");                                   // Start timer
     MPI_Datatype MPI_TYPE = getType(XMIN[l][0]);                // Get MPI data type
-    vect send[2],recv[2];                                       // Send and recv buffer
+    vec3 send[2],recv[2];                                       // Send and recv buffer
     MPI_Request req;                                            // MPI requests
     send[0] = send[1] = XMIN[l];                                // Set XMIN into send buffer
     recv[0] = recv[1] = 0;                                      // Initialize recv buffer
   }
 
 //! Get disatnce to other domain
-  real getDistance(C_iter C, vect xmin, vect xmax) {
-    vect dist;                                                  // Distance vector
+  real getDistance(C_iter C, vec3 xmin, vec3 xmax) {
+    vec3 dist;                                                  // Distance vector
     for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
       dist[d] = (C->X[d] + Xperiodic[d] > xmax[d])*             //  Calculate the distance between cell C and
                 (C->X[d] + Xperiodic[d] - xmax[d])+             //  the nearest point in domain [xmin,xmax]^3
   }
 
 //! Determine which cells to send
-  void getLET(C_iter C0, C_iter C, vect xmin, vect xmax) {
+  void getLET(C_iter C0, C_iter C, vec3 xmin, vec3 xmax) {
     int level = int(log(MPISIZE-1) / M_LN2 / 3) + 1;            // Level of local root cell
     if( MPISIZE == 1 ) level = 0;                               // Account for serial case
     for( int i=0; i!=C->NCHILD; i++ ) {                         // Loop over child cells
 
 //! Communicate cells in the local essential tree
   void commCells(Bodies &bodies, Cells &cells) {
-    vect xmin = 0, xmax = 0;                                    // Initialize domain boundaries
+    vec3 xmin = 0, xmax = 0;                                    // Initialize domain boundaries
     Cells twigs,sticks;                                         // Twigs and sticks are special types of cells
 
 #if 1

include/partition.h

 
 protected:
   int LEVEL;                                                    //!< Level of the MPI process binary tree
-  std::vector<vect> XMIN;                                       //!< Minimum position vector of bodies
-  std::vector<vect> XMAX;                                       //!< Maximum position vector of bodies
+  std::vector<vec3> XMIN;                                       //!< Minimum position vector of bodies
+  std::vector<vec3> XMAX;                                       //!< Maximum position vector of bodies
   int nprocs[64][2];                                            //!< Number of processes in the two split groups
   int offset[64][2];                                            //!< Offset of body in the two split groups
   int  color[64][3];                                            //!< Color of hypercube communicators
   ~Partition() {}
 
 //! Set bounds of domain to be partitioned
-  void setGlobDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
+  void setGlobDomain(Bodies &bodies, vec3 x0=0, real r0=M_PI) {
     numCells1D = 1 << getMaxLevel(bodies);                      // Set initial number of bodies
     B_iter B = bodies.begin();                                  // Reset body iterator
     XMIN[0] = XMAX[0] = B->X;                                   // Initialize xmin,xmax
         else if(B->X[d] > XMAX[0][d]) XMAX[0][d] = B->X[d];     //   Determine xmax
       }                                                         //  End loop over each dimension
     }                                                           // End loop over bodies
-    vect X;                                                     // Recv buffer
+    vec3 X;                                                     // Recv buffer
     MPI_Allreduce(XMAX[0],X,3,MPI_TYPE,MPI_MAX,MPI_COMM_WORLD); // Reduce global maximum
     XMAX[0] = X;                                                // Get data from buffer
     MPI_Allreduce(XMIN[0],X,3,MPI_TYPE,MPI_MIN,MPI_COMM_WORLD); // Reduce global minimum

include/serialfmm.h

 
 //! Topdown tree constructor interface. Input: bodies, Output: cells
   void topdown(Bodies &bodies, Cells &cells) {
+#if COMPARE
+    TopDown<equation>::growTree(bodies);
+    TopDown<equation>::linkTree(cells);
+    TopDown<equation>::upwardPass(cells);
+#else
     TopDown<equation>::grow(bodies);                            // Grow tree structure topdown
 
     TopDown<equation>::setIndex();                              // Set index of cells
       for( int i=1; i<MTERM; ++i ) C->M[i] /= C->M[0];
     }
 #endif
+#endif
   }
 
 //! Bottomup tree constructor interface. Input: bodies, Output: cells

include/topdown.h

 class TopDown : public TreeStructure<equation> {
 public:
   using Kernel<equation>::printNow;                             //!< Switch to print timings
+  using Kernel<equation>::stringLength;                         //!< Max length of event name
   using Kernel<equation>::startTimer;                           //!< Start timer for given event
   using Kernel<equation>::stopTimer;                            //!< Stop timer for given event
   using Kernel<equation>::X0;                                   //!< Center of root cell
   using Kernel<equation>::R0;                                   //!< Radius of root cell
+  using Kernel<equation>::Ci0;                                  //!< Begin iterator for target cells
 
 private:
-//! Nodes are primitive cells
-  struct Node {
-    int LEVEL;                                                  //!< Level of node
-    int ICHILD;                                                 //!< Flag of empty child nodes
-    int NBODY;                                                  //!< Number of leafs in node
-    bigint I;                                                   //!< Cell index
-    bigint CHILD[8];                                            //!< Iterator offset of child nodes
-    B_iter BODY[NCRIT];                                         //!< Iterator for leafs
-    vect X;                                                     //!< Node center
-    real R;                                                     //!< Node radius
+//! Binary tree is used for counting number of bodies with a recursive approach
+  struct BinaryTreeNode {
+    ivec8            NBODY;                                     //!< Number of descendant bodies
+    BinaryTreeNode * LEFT;                                      //!< Pointer to left child
+    BinaryTreeNode * RIGHT;                                     //!< Pointer to right child
+    BinaryTreeNode * BEGIN;                                     //!< Pointer to begining of memory space
+    BinaryTreeNode * END;                                       //!< Pointer to end of memory space
   };
-  std::vector<Node> nodes;                                      //!< Nodes in the tree
 
-//! Calculate octant from position
-  int getOctant(const vect X, int i) {
-    int octant = 0;                                             // Initialize octant
-    for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
-      octant += (X[d] > nodes[i].X[d]) << d;                    //  interleave bits and accumulate octant
-    }                                                           // End loop over dimensions
-    return octant;                                              // Return octant
+//! Octree is used for building the FMM tree structure as "nodes", then transformed to "cells" data structure
+  struct OctreeNode {
+    int          BODY;                                          //!< Index offset for first body in node
+    int          NBODY;                                         //!< Number of descendant bodies
+    int          NNODE;                                         //!< Number of descendant nodes
+    OctreeNode * CHILD[8];                                      //!< Pointer to child node
+    vec3         X;                                             //!< Coordinate at center
+  };
+
+  int          MAXLEVEL;                                        //!< Maximum level of tree
+  B_iter       B0;                                              //!< Iterator of first body
+  OctreeNode * N0;                                              //!< Octree root node
+
+private:
+//! Get number of binary tree nodes for a given number of bodies
+  inline int getNumBinNode(int n) const {
+    if (n <= NSPAWN) return 1;                                  // If less then threshold, use only one node
+    else return 4 * ((n - 1) / NSPAWN) - 1;                     // Else estimate number of binary tree nodes
   }
 
-//! Add child node and link it
-  void addChild(const int octant, int i) {
-    bigint pOff = ((1 << 3* nodes[i].LEVEL   ) - 1) / 7;        // Parent cell index offset
-    bigint cOff = ((1 << 3*(nodes[i].LEVEL+1)) - 1) / 7;        // Current cell index offset
-    vect x = nodes[i].X;                                        // Initialize new center position with old center
-    real r = nodes[i].R/2;                                      // Initialize new size
-    for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
-      x[d] += r * (((octant & 1 << d) >> d) * 2 - 1);           //  Calculate new center position
-    }                                                           // End loop over dimensions
-    Node node;                                                  // Node structure
-    node.NBODY = node.ICHILD = 0;                               // Initialize child node counters
-    node.X = x;                                                 // Initialize child node center
-    node.R = r;                                                 // Initialize child node radius
-    node.LEVEL = nodes[i].LEVEL + 1;                            // Level of child node
-    node.I = ((nodes[i].I-pOff) << 3) + octant + cOff;          // Cell index of child node
-    nodes[i].ICHILD |= (1 << octant);                           // Flip bit of octant
-    nodes[i].CHILD[octant] = nodes.end()-nodes.begin();         // Link child node to its parent
-    nodes.push_back(node);                                      // Push child node into vector
+//! Get maximum number of binary tree nodes for a given number of bodies
+  inline int getMaxBinNode(int n) const {
+    return (4 * n) / NSPAWN;                                    // Conservative estimate of number of binary tree nodes
   }
 
-//! Add leaf to node
-  void addLeaf(B_iter b, int i) {
-    nodes[i].BODY[nodes[i].NBODY] = b;                          // Assign body iterator to leaf
-    nodes[i].NBODY++;                                           // Increment leaf counter
+//! Exclusive scan with offset
+  inline ivec8 exclusiveScan(ivec8 input, int offset) const {
+    ivec8 output;                                               // Output vector
+    for (int i=0; i<8; i++) {                                   // Loop over elements
+      output[i] = offset;                                       //  Set value
+      offset += input[i];                                       //  Increment offset
+    }                                                           // End loop over elements
+    return output;                                              // Return output vector
   }
 
-//! Split node and reassign leafs to child nodes
-  void splitNode(int i) {
-    for( int l=0; l!=NCRIT; ++l ) {                             // Loop over leafs in parent node
-      int octant = getOctant(nodes[i].BODY[l]->X,i);            //  Find the octant where the body belongs
-      if( !(nodes[i].ICHILD & (1 << octant)) ) {                //  If child doesn't exist in this octant
-        addChild(octant,i);                                     //   Add new child to list
-      }                                                         //  Endif for octant
-      int c = nodes[i].CHILD[octant];                           //  Set counter for child node
-      addLeaf(nodes[i].BODY[l],c);                              //  Add leaf to child
-      if( nodes[c].NBODY >= NCRIT ) {                           //  If there are still too many leafs
-        splitNode(c);                                           //   Split the node into smaller ones
-      }                                                         //  Endif for too many leafs
-    }                                                           // End loop over leafs
+//! Create an octree node
+  OctreeNode * makeOctNode(int begin, int end, vec3 X, bool nochild) const {
+    OctreeNode * octNode = new OctreeNode();                    // Allocate memory for single node
+    octNode->BODY = begin;                                      // Index of first body in node
+    octNode->NBODY = end - begin;                               // Number of bodies in node
+    octNode->NNODE = 1;                                         // Initialize counter for decendant nodes
+    octNode->X = X;                                             // Center position of node
+    if (nochild) {                                              // If node has no children
+      for (int i=0; i<8; i++) octNode->CHILD[i] = NULL;         //  Initialize pointers to children
+    }                                                           // End if for node children
+    return octNode;                                             // Return node
   }
 
-//! Traverse tree
-  void traverse(typename std::vector<Node>::iterator N) {
-    if( N->NBODY >= NCRIT ) {                                   // If node has children
-      for( int i=0; i!=8; ++i ) {                               // Loop over children
-        if( N->ICHILD & (1 << i) ) {                            //  If child exists in this octant
-          traverse(nodes.begin()+N->CHILD[i]);                  //   Recursively search child node
-        }                                                       //  Endif for octant
-      }                                                         // End loop over children
-    } else {                                                    //  If child doesn't exist
-      for( int i=0; i!=N->NBODY; ++i ) {                        //   Loop over leafs
-        N->BODY[i]->ICELL = N->I;                               //    Store cell index in bodies
-      }                                                         //   End loop over leafs
-    }                                                           //  Endif for child existence
+//! Count bodies in each octant using binary tree recursion
+  void countBodies(Bodies& bodies, int begin, int end, vec3 X, BinaryTreeNode * binNode) {
+    assert(getNumBinNode(end - begin) <= binNode->END - binNode->BEGIN + 1);
+    if (end - begin <= NSPAWN) {                                // If number of bodies is less than threshold
+      for (int i=0; i<8; i++) binNode->NBODY[i] = 0;            //  Initialize number of bodies in octant
+      binNode->LEFT = binNode->RIGHT = NULL;                    //  Initialize pointers to left and right child node
+      for (int i=begin; i<end; i++) {                           //  Loop over bodies in node
+        vec3 x = bodies[i].X;                                   //   Position of body
+        int octant = (x[0] > X[0]) + ((x[1] > X[1]) << 1) + ((x[2] > X[2]) << 2);// Which octant body belongs to
+        binNode->NBODY[octant]++;                               //   Increment body count in octant
+      }                                                         //  End loop over bodies in node
+    } else {                                                    // Else if number of bodies is larger than threshold
+      int mid = (begin + end) / 2;                              //  Split range of bodies in half
+      int numLeftNode = getNumBinNode(mid - begin);             //  Number of binary tree nodes on left branch
+      int numRightNode = getNumBinNode(end - mid);              //  Number of binary tree nodes on right branch
+      assert(numLeftNode + numRightNode <= binNode->END - binNode->BEGIN);
+      binNode->LEFT = binNode->BEGIN;                           //  Assign first memory address to left node pointer
+      binNode->LEFT->BEGIN = binNode->LEFT + 1;                 //  Assign next memory address to left begin pointer
+      binNode->LEFT->END = binNode->LEFT + numLeftNode;         //  Keep track of last memory address used by left
+      binNode->RIGHT = binNode->LEFT->END;                      //  Assign that same address to right node pointer
+      binNode->RIGHT->BEGIN = binNode->RIGHT + 1;               //  Assign next memory address to right begin pointer
+      binNode->RIGHT->END = binNode->RIGHT + numRightNode;      //  Keep track of last memory address used by right
+      countBodies(bodies, begin, mid, X, binNode->LEFT);        //  Recursive call for left branch
+      countBodies(bodies, mid, end, X, binNode->RIGHT);         //  Recursive call for right branch
+      binNode->NBODY = binNode->LEFT->NBODY + binNode->RIGHT->NBODY;// Sum contribution from both branches
+    }                                                           // End if for number of bodies
   }
 
-public:
-//! Grow tree from root
-  void grow(Bodies &bodies) {
+//! Sort bodies according to octant (Morton order)
+  void moveBodies(Bodies& bodies, Bodies& buffer, int begin, int end,
+                  BinaryTreeNode * binNode, ivec8 octantOffset, vec3 X) const {
+    if (binNode->LEFT == NULL) {                                // If there are no more child nodes
+      for (int i=begin; i<end; i++) {                           //  Loop over bodies
+        vec3 x = bodies[i].X;                                   //   Position of body
+        int octant = (x[0] > X[0]) + ((x[1] > X[1]) << 1) + ((x[2] > X[2]) << 2);// Which octant body belongs to`
+        buffer[octantOffset[octant]] = bodies[i];               //   Permute bodies out-of-place according to octant
+        octantOffset[octant]++;                                 //   Increment body count in octant
+      }                                                         //  End loop over bodies
+    } else {                                                    // Else if there are child nodes
+      int mid = (begin + end) / 2;                              //  Split range of bodies in half
+      moveBodies(bodies, buffer, begin, mid, binNode->LEFT, octantOffset, X);// Recursive call for left branch
+      octantOffset += binNode->LEFT->NBODY;                     //  Increment the octant offset for right branch
+      moveBodies(bodies, buffer, mid, end, binNode->RIGHT, octantOffset, X);// Recursive call for right branch
+    }                                                           // End if for child existance
+  }
+
+//! Build nodes of octree adaptively using a top-down approach based on recursion (uses task based thread parallelism)
+  OctreeNode * buildNodes(Bodies& bodies, Bodies& buffer, int begin, int end,
+                          BinaryTreeNode * binNode, vec3 X, int level=0, bool direction=false) {
+    assert(getMaxBinNode(end - begin) <= binNode->END - binNode->BEGIN);
+    if (begin == end) return NULL;                              // If no bodies left, return null pointer
+    if (end - begin <= NCRIT) {                                 // If number of bodies is less than threshold
+      if (direction)                                            //  If direction of data is from bodies to buffer
+        for (int i=begin; i<end; i++) buffer[i] = bodies[i];    //   Copy bodies to buffer
+      return makeOctNode(begin, end, X, true);                  //  Create an octree node and return it's pointer
+    }                                                           // End if for number of bodies
+    OctreeNode * octNode = makeOctNode(begin, end, X, false);   // Create an octree node with child nodes
+    countBodies(bodies, begin, end, X, binNode);                // Count bodies in each octant using binary recursion
+    ivec8 octantOffset = exclusiveScan(binNode->NBODY, begin);  // Exclusive scan to obtain offset from octant count
+    moveBodies(bodies, buffer, begin, end, binNode, octantOffset, X);// Sort bodies according to octant
+    BinaryTreeNode * binNodeOffset = binNode->BEGIN;            // Initialize pointer offset for binary tree nodes
+    for (int i=0; i<8; i++) {                                   // Loop over children
+      int maxBinNode = getMaxBinNode(binNode->NBODY[i]);        //  Get maximum number of binary tree nodes
+      assert(binNodeOffset + maxBinNode <= binNode->END);
+      vec3 Xchild = X;                                          //  Initialize center position of child node
+      real r = R0 / (1 << (level + 1));                         //  Radius of cells for child's level
+      for (int d=0; d<3; d++) {                                 //  Loop over dimensions
+        Xchild[d] += r * (((i & 1 << d) >> d) * 2 - 1);         //   Shift center position to that of child node
+      }                                                         //  End loop over dimensions
+      BinaryTreeNode binNodeChild[1];                           //  Allocate new root for this branch
+      binNodeChild->BEGIN = binNodeOffset;                      //  Assign first memory address from offset
+      binNodeChild->END = binNodeOffset + maxBinNode;           //  Keep track of last memory address
+      octNode->CHILD[i] = buildNodes(buffer, bodies,            //  Recursive call for each child
+        octantOffset[i], octantOffset[i] + binNode->NBODY[i],   //  Range of bodies is calcuated from octant offset
+        binNodeChild, Xchild, level+1, !direction);             //  Alternate copy direction bodies <-> buffer
+      binNodeOffset += maxBinNode;                              //  Increment offset for binNode memory address
+    }                                                           // End loop over children
+    for (int i=0; i<8; i++) {                                   // Loop over children
+      if (octNode->CHILD[i]) octNode->NNODE += octNode->CHILD[i]->NNODE;// If child exists increment child node counter
+    }                                                           // End loop over chlidren
+    return octNode;                                             // Return octree node
+  }
+
+//! Create cell data structure from nodes
+  void nodes2cells(OctreeNode * octNode, C_iter C, C_iter CN, int level=0, int iparent=0) {
+    C->PARENT = iparent;                                        // Index of parent cell
+    C->R      = R0 / (1 << level);                     // Cell radius
+    C->X      = octNode->X;                                     // Cell center
+    C->NDBODY = octNode->NBODY;                                 // Number of decendant bodies
+    C->BODY   = B0 + octNode->BODY;                             // Iterator of first body in cell
+    if (octNode->NNODE == 1) {                                  // If node has no children
+      C->CHILD  = 0;                                            //  Set index of first child cell to zero
+      C->NCHILD = 0;                                            //  Number of child cells
+      C->NCBODY = octNode->NBODY;                               //  Number of bodies in cell
+      assert(C->NCBODY > 0);
+      MAXLEVEL = std::max(MAXLEVEL, level);                     //  Update maximum level of tree
+    } else {                                                    // Else if node has children
+      C->NCBODY = 0;                                            //  Set number of bodies in cell to zero
+      int nchild = 0;                                           //  Initialize number of child cells
+      int octants[8];                                           //  Map of child index to octants (for when nchild < 8)
+      for (int i=0; i<8; i++) {                                 //  Loop over octants
+        if (octNode->CHILD[i]) {                                //   If child exists for that octant
+          octants[nchild] = i;                                  //    Map octant to child index
+          nchild++;                                             //    Increment child cell counter
+        }                                                       //   End if for child existance
+      }                                                         //  End loop over octants
+      C_iter Ci = CN;                                           //  CN points to the next free memory address
+      C->CHILD = Ci - Ci0;                                      //  Set Index of first child cell
+      C->NCHILD = nchild;                                       //  Number of child cells
+      assert(C->NCHILD > 0);
+      CN += nchild;                                             //  Increment next free memory address
+      for (int i=0; i<nchild; i++) {                            //  Loop over children
+        int octant = octants[i];                                //   Get octant from child index
+        nodes2cells(octNode->CHILD[octant], Ci, CN, level+1, C-Ci0);// Recursive call for each child
+        Ci++;                                                   //   Increment cell iterator
+        CN += octNode->CHILD[octant]->NNODE - 1;                //   Increment next free memory address
+      }                                                         //  End loop over children
+      for (int i=0; i<nchild; i++) {                            //  Loop over children
+        int octant = octants[i];                                //   Get octant from child index
+        delete octNode->CHILD[octant];                          //   Free child pointer to avoid memory leak
+      }                                                         //  End loop over children
+      MAXLEVEL = std::max(MAXLEVEL, level+1);                   //  Update maximum level of tree
+    }                                                           // End if for child existance
+  }
+
+//! Get Xmin and Xmax of domain
+  vec3Pair getBounds(B_iter BiBegin, B_iter BiEnd) {
+    assert(BiEnd - BiBegin > 0);
+    if (BiEnd - BiBegin < NSPAWN) {                             // If number of elements is small enough
+      vec3 Xmin = BiBegin->X, Xmax = BiBegin->X;                //  Initialize Xmin and Xmax with first value
+      for (B_iter B=BiBegin; B!=BiEnd; B++) {                   //  Loop over range of bodies
+        Xmin = min(B->X, Xmin);                                 //   Update Xmin
+        Xmax = max(B->X, Xmax);                                 //   Update Xmax
+      }                                                         //  End loop over range of bodies
+      return vec3Pair(Xmin, Xmax);                              //  Return Xmin and Xmax as pair
+    } else {                                                    // Else if number of elements are large
+      B_iter BiMid = BiBegin + (BiEnd - BiBegin) / 2;           //  Middle iterator
+      vec3Pair bounds0, bounds1;                                //  Pair : first Xmin : second Xmax
+      bounds0 = getBounds(BiBegin, BiMid);                      //  Recursive call with new task
+      bounds1 = getBounds(BiMid, BiEnd);                        //  Recursive call with old task
+      bounds0.first = min(bounds0.first, bounds1.first);        //  Minimum of the two Xmins
+      bounds0.second = max(bounds0.second, bounds1.second);     //  Maximum of the two Xmaxs
+      return bounds0;                                           //  Return Xmin and Xmax
+    }                                                           // End if for number fo elements
+  }
+
+protected:
+//! Grow tree structure top down
+  void growTree(Bodies &bodies) {
+    Bodies buffer = bodies;                                     // Copy bodies to buffer
     startTimer("Grow tree");                                    // Start timer
-    int octant;                                                 // In which octant is the body located?
-    Node node;                                                  // Node structure
-    node.LEVEL = node.NBODY = node.ICHILD = node.I = 0;         // Initialize root node counters
-    node.X = X0;                                                // Initialize root node center
-    node.R = R0;                                                // Initialize root node radius
-    nodes.push_back(node);                                      // Push child node into vector
-    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
-      int i = 0;                                                //  Reset node counter
-      while( nodes[i].NBODY >= NCRIT ) {                        //  While the nodes have children
-        nodes[i].NBODY++;                                       //   Increment the cumulative leaf counter
-        octant = getOctant(B->X,i);                             //   Find the octant where the body belongs
-        if( !(nodes[i].ICHILD & (1 << octant)) ) {              //   If child doesn't exist in this octant
-          addChild(octant,i);                                   //    Add new child to list
-        }                                                       //   Endif for child existence
-        i = nodes[i].CHILD[octant];                             //    Update node iterator to child
-      }                                                         //  End while loop
-      addLeaf(B,i);                                             //  Add body to node as leaf
-      if( nodes[i].NBODY >= NCRIT ) {                           //  If there are too many leafs
-        splitNode(i);                                           //   Split the node into smaller ones
-      }                                                         //  Endif for splitting
-    }                                                           // End loop over bodies
+    B0 = bodies.begin();                                        // Bodies iterator
+    BinaryTreeNode binNode[1];                                  // Allocate root node of binary tree
+    int maxBinNode = getMaxBinNode(bodies.size());              // Get maximum size of binary tree
+    binNode->BEGIN = new BinaryTreeNode[maxBinNode];            // Allocate array for binary tree nodes
+    binNode->END = binNode->BEGIN + maxBinNode;                 // Set end pointer
+    N0 = buildNodes(bodies, buffer, 0, bodies.size(), binNode, X0);// Build tree recursively
+    delete[] binNode->BEGIN;                                    // Deallocate binary tree array
     stopTimer("Grow tree",printNow);                            // Stop timer
   }
 
-//! Store cell index of all bodies
-  void setIndex() {
-    startTimer("Set index");                                    // Start timer
-    traverse(nodes.begin());                                    // Traverse tree
-    stopTimer("Set index",printNow);                            // Stop timer 
+//! Link tree structure
+  void linkTree(Cells &cells) {
+    startTimer("Link tree");                                    // Start timer
+    cells.resize(N0->NNODE);                                    // Allocate cells array
+    Ci0 = cells.begin();                                        // Cell begin iterator
+    nodes2cells(N0, Ci0, Ci0+1);                                // Convert nodes to cells recursively
+    delete N0;                                                  // Deallocate nodes
+    stopTimer("Link tree",printNow);                            // Stop timer
   }
+
+  void printTreeData(Cells &cells) {
+    std::cout << "--- FMM stats --------------------" << std::endl
+    << std::setw(stringLength) << std::left                      // Set format
+    << "Bodies"     << " : " << cells.front().NDBODY << std::endl// Print number of bodies
+    << std::setw(stringLength) << std::left                      // Set format
+    << "Cells"      << " : " << cells.size() << std::endl       // Print number of cells
+    << std::setw(stringLength) << std::left                      // Set format
+    << "Tree depth" << " : " << MAXLEVEL << std::endl           // Print number of levels
+#if COUNT
+    << std::setw(stringLength) << std::left                      // Set format
+    << "P2P calls"  << " : " << NP2P << std::endl               // Print number of P2P calls
+    << std::setw(stringLength) << std::left                      // Set format
+    << "M2L calls"  << " : " << NM2L << std::endl               // Print number of M2l calls
+#endif
+    << "--- FMM stats --------------------" << std::endl;
+  }
+public:
+  TopDown() : MAXLEVEL(0) {}
+
 };
 
 #endif

include/topdown2.h

+/*
+Copyright (C) 2011 by Rio Yokota, Simon Layton, Lorena Barba
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef topdown_h
+#define topdown_h
+#include "tree.h"
+
+//! Topdown tree constructor class
+template<Equation equation>
+class TopDown : public TreeStructure<equation> {
+public:
+  using Kernel<equation>::printNow;                             //!< Switch to print timings
+  using Kernel<equation>::startTimer;                           //!< Start timer for given event
+  using Kernel<equation>::stopTimer;                            //!< Stop timer for given event
+  using Kernel<equation>::X0;                                   //!< Center of root cell
+  using Kernel<equation>::R0;                                   //!< Radius of root cell
+
+private:
+//! Nodes are primitive cells
+  struct Node {
+    int LEVEL;                                                  //!< Level of node
+    int ICHILD;                                                 //!< Flag of empty child nodes
+    int NBODY;                                                  //!< Number of leafs in node
+    bigint I;                                                   //!< Cell index
+    bigint CHILD[8];                                            //!< Iterator offset of child nodes
+    B_iter BODY[NCRIT];                                         //!< Iterator for leafs
+    vec3 X;                                                     //!< Node center
+    real R;                                                     //!< Node radius
+  };
+  std::vector<Node> nodes;                                      //!< Nodes in the tree
+
+//! Calculate octant from position
+  int getOctant(const vec3 X, int i) {
+    int octant = 0;                                             // Initialize octant
+    for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
+      octant += (X[d] > nodes[i].X[d]) << d;                    //  interleave bits and accumulate octant
+    }                                                           // End loop over dimensions
+    return octant;                                              // Return octant
+  }
+
+//! Add child node and link it
+  void addChild(const int octant, int i) {
+    bigint pOff = ((1 << 3* nodes[i].LEVEL   ) - 1) / 7;        // Parent cell index offset
+    bigint cOff = ((1 << 3*(nodes[i].LEVEL+1)) - 1) / 7;        // Current cell index offset
+    vec3 x = nodes[i].X;                                        // Initialize new center position with old center
+    real r = nodes[i].R/2;                                      // Initialize new size
+    for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
+      x[d] += r * (((octant & 1 << d) >> d) * 2 - 1);           //  Calculate new center position
+    }                                                           // End loop over dimensions
+    Node node;                                                  // Node structure
+    node.NBODY = node.ICHILD = 0;                               // Initialize child node counters
+    node.X = x;                                                 // Initialize child node center
+    node.R = r;                                                 // Initialize child node radius
+    node.LEVEL = nodes[i].LEVEL + 1;                            // Level of child node
+    node.I = ((nodes[i].I-pOff) << 3) + octant + cOff;          // Cell index of child node
+    nodes[i].ICHILD |= (1 << octant);                           // Flip bit of octant
+    nodes[i].CHILD[octant] = nodes.end()-nodes.begin();         // Link child node to its parent
+    nodes.push_back(node);                                      // Push child node into vector
+  }
+
+//! Add leaf to node
+  void addLeaf(B_iter b, int i) {
+    nodes[i].BODY[nodes[i].NBODY] = b;                          // Assign body iterator to leaf
+    nodes[i].NBODY++;                                           // Increment leaf counter
+  }
+
+//! Split node and reassign leafs to child nodes
+  void splitNode(int i) {
+    for( int l=0; l!=NCRIT; ++l ) {                             // Loop over leafs in parent node
+      int octant = getOctant(nodes[i].BODY[l]->X,i);            //  Find the octant where the body belongs
+      if( !(nodes[i].ICHILD & (1 << octant)) ) {                //  If child doesn't exist in this octant
+        addChild(octant,i);                                     //   Add new child to list
+      }                                                         //  Endif for octant
+      int c = nodes[i].CHILD[octant];                           //  Set counter for child node
+      addLeaf(nodes[i].BODY[l],c);                              //  Add leaf to child
+      if( nodes[c].NBODY >= NCRIT ) {                           //  If there are still too many leafs
+        splitNode(c);                                           //   Split the node into smaller ones
+      }                                                         //  Endif for too many leafs
+    }                                                           // End loop over leafs
+  }
+
+//! Traverse tree
+  void traverse(typename std::vector<Node>::iterator N) {
+    if( N->NBODY >= NCRIT ) {                                   // If node has children
+      for( int i=0; i!=8; ++i ) {                               // Loop over children
+        if( N->ICHILD & (1 << i) ) {                            //  If child exists in this octant
+          traverse(nodes.begin()+N->CHILD[i]);                  //   Recursively search child node
+        }                                                       //  Endif for octant
+      }                                                         // End loop over children
+    } else {                                                    //  If child doesn't exist
+      for( int i=0; i!=N->NBODY; ++i ) {                        //   Loop over leafs
+        N->BODY[i]->ICELL = N->I;                               //    Store cell index in bodies
+      }                                                         //   End loop over leafs
+    }                                                           //  Endif for child existence
+  }
+
+public:
+//! Grow tree from root
+  void grow(Bodies &bodies) {
+    startTimer("Grow tree");                                    // Start timer
+    int octant;                                                 // In which octant is the body located?
+    Node node;                                                  // Node structure
+    node.LEVEL = node.NBODY = node.ICHILD = node.I = 0;         // Initialize root node counters
+    node.X = X0;                                                // Initialize root node center
+    node.R = R0;                                                // Initialize root node radius
+    nodes.push_back(node);                                      // Push child node into vector
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
+      int i = 0;                                                //  Reset node counter
+      while( nodes[i].NBODY >= NCRIT ) {                        //  While the nodes have children
+        nodes[i].NBODY++;                                       //   Increment the cumulative leaf counter
+        octant = getOctant(B->X,i);                             //   Find the octant where the body belongs
+        if( !(nodes[i].ICHILD & (1 << octant)) ) {              //   If child doesn't exist in this octant
+          addChild(octant,i);                                   //    Add new child to list
+        }                                                       //   Endif for child existence
+        i = nodes[i].CHILD[octant];                             //    Update node iterator to child
+      }                                                         //  End while loop
+      addLeaf(B,i);                                             //  Add body to node as leaf
+      if( nodes[i].NBODY >= NCRIT ) {                           //  If there are too many leafs
+        splitNode(i);                                           //   Split the node into smaller ones
+      }                                                         //  Endif for splitting
+    }                                                           // End loop over bodies
+    stopTimer("Grow tree",printNow);                            // Stop timer
+  }
+
+//! Store cell index of all bodies
+  void setIndex() {
+    startTimer("Set index");                                    // Start timer
+    traverse(nodes.begin());                                    // Traverse tree
+    stopTimer("Set index",printNow);                            // Stop timer 
+  }
+};
+
+#endif
   using Kernel<equation>::sortCells;                            //!< Sort cells according to cell index
   using Kernel<equation>::X0;                                   //!< Center of root cell
   using Kernel<equation>::R0;                                   //!< Radius of root cell
+  using Kernel<equation>::Ci0;                                  //!< Begin iterator for target cells
+  using Kernel<equation>::Cj0;                                  //!< Begin iterator for source cells
   using Evaluator<equation>::NP2P;                              //!< Number of P2P kernel calls
   using Evaluator<equation>::NM2P;                              //!< Number of M2P kernel calls
   using Evaluator<equation>::NM2L;                              //!< Number of M2L kernel calls
   using Evaluator<equation>::traverse;                          //!< Traverse tree to get interaction list
   using Evaluator<equation>::traversePeriodic;                  //!< Traverse tree for periodic images
   using Evaluator<equation>::neighbor;                          //!< Traverse source tree to get neighbor list
+  using Evaluator<equation>::P2M;                               //!< Evaluate P2M kernel
+  using Evaluator<equation>::M2M;                               //!< Evaluate M2M kernel
+  using Evaluator<equation>::L2L;                               //!< Evaluate L2L kernel
+  using Evaluator<equation>::L2P;                               //!< Evaluate L2P kernel
   using Evaluator<equation>::evalP2M;                           //!< Evaluate P2M kernel
   using Evaluator<equation>::evalM2M;                           //!< Evaluate M2M kernel
   using Evaluator<equation>::evalM2L;                           //!< Evaluate M2L kernel
     begin = oldend;                                             // Set new begin index to old end index
   }
 
+//! Recursive call for upward pass
+  void upwardRecursion(C_iter C, C_iter C0) {
+    for (C_iter CC=C0+C->CHILD; CC!=C0+C->CHILD+C->NCHILD; CC++) {// Loop over child cells
+      upwardRecursion(CC, C0);                                  //  Recursive call with new task
+    }                                                           // End loop over child cells
+    C->M = 0;                                                   // Initialize multipole expansion coefficients
+    C->L = 0;                                                   // Initialize local expansion coefficients
+    P2M(C);                                                     // P2M kernel
+    M2M(C);                                                     // M2M kernel
+  }
+
+//! Recursive call for downward pass
+  void downwardRecursion(C_iter C, C_iter C0) const {
+    L2L(C);                                                     // L2L kernel
+    L2P(C);                                                     // L2P kernel
+    for (C_iter CC=C0+C->CHILD; CC!=C0+C->CHILD+C->NCHILD; CC++) {// Loop over child cells
+      downwardRecursion(CC, C0);                                //  Recursive call with new task
+    }                                                           // End loop over chlid cells
+  }
+
+//! Error optimization of Rcrit
+  void setRcrit(C_iter C, C_iter C0, real c) {
+    for (C_iter CC=C0+C->CHILD; CC!=C0+C->CHILD+C->NCHILD; CC++) {// Loop over child cells
+      setRcrit(CC, C0, c);                                      //  Recursive call with new task
+    }                                                           // End loop over child cells
+    for( int i=1; i<MTERM; ++i ) C->M[i] /= C->M[0];            // Normalize multipole expansion coefficients
+    real x = 1.0 / THETA;                                       // Inverse of theta
+#if ERROR_OPT
+    real a = c * powf(std::abs(C->M[0]),1.0/3);                 // Cell coefficient
+    for (int i=0; i<5; i++) {                                   // Newton-Rhapson iteration
+      real f = x * x - 2 * x + 1 - a * pow(x,-P);               //  Function value
+      real df = (P + 2) * x - 2 * (P + 1) + P / x;              //  Function derivative value
+      x -= f / df;                                              //  Increment x
+    }                                                           // End Newton-Rhapson iteration
+#endif
+    C->RCRIT *= x;                                              // Multiply Rcrit by error optimized parameter x
+  }
+
 protected:
 //! Get cell center and radius from cell index
   void getCenter(Cell &cell) {
     }                                                           // End loop over dimensions
   }
 
+//! Upward pass (P2M, M2M)
+  void upwardPass(Cells &cells) {
+    startTimer("Upward pass");                                  // Start timer
+    Ci0 = cells.begin();                                        // Set iterator of target root cell
+    Cj0 = cells.begin();                                        // Set iterator of source root cell
+    upwardRecursion(Ci0, Ci0);                                  // Recursive call for upward pass
+    real c = (1 - THETA) * (1 - THETA) / pow(THETA,P+2) / powf(std::abs(Ci0->M[0]),1.0/3); // Root coefficient
+    setRcrit(Ci0, Ci0, c);                                      // Error optimization of Rcrit
+    for (C_iter C=cells.begin(); C!=cells.begin()+9; C++) {     // Loop over top 2 levels of cells
+      C->RCRIT *= 10;                                           //  Prevent approximation
+    }                                                           // End loop over top 2 levels of cells
+    stopTimer("Upward pass",printNow);                          // Stop timer
+  }
+
+//! Downward pass (L2L, L2P)
+  void downwardPass(Cells &cells) {
+    startTimer("Downward pass");                                // Start timer
+    C_iter C0 = cells.begin();                                  // Root cell
+    L2P(C0);                                                    // If root is the only cell do L2P
+    for (C_iter CC=C0+C0->CHILD; CC!=C0+C0->CHILD+C0->NCHILD; CC++) {// Loop over child cells
+      downwardRecursion(CC, C0);                                //  Recursive call for downward pass
+    }                                                           // End loop over child cells
+    stopTimer("Downward pass",printNow);                        // Stop timer
+  }
+
 public:
 //! Group bodies into twig cells
   void bodies2twigs(Bodies &bodies, Cells &twigs) {
     evalM2P(cells);                                             // Evaluate queued M2P kernels (only GPU)
     evalP2P(cells);                                             // Evaluate queued P2P kernels (only GPU)
 #endif
+#if COMPARE
+    downwardPass(cells);
+#else
     evalL2L(cells);                                             // Evaluate all L2L kernels
     evalL2P(cells);                                             // Evaluate all L2P kernels
+#endif
     if(printNow) std::cout << "P2P: "  << NP2P
                            << " M2P: " << NM2P
                            << " M2L: " << NM2L << std::endl;
 typedef float              real;                                //!< Real number type on CPU
 typedef float              gpureal;                             //!< Real number type on GPU
 typedef std::complex<real> complex;                             //!< Complex number type
-typedef vec<3,real>        vect;                                //!< 3-D vector type
-
+typedef vec<3,real>        vec3;                                //!< 3-D vector type
+typedef vec<3,float>       fvec3;                               //!< Vector of 3 single precision types
+typedef vec<8,int>         ivec8;                               //!< Vector of 8 integer types
+typedef std::pair<vec3,vec3> vec3Pair;                          //!< Pair of vec3
 
 #ifndef KERNEL
 int MPIRANK    = 0;                                             //!< MPI comm rank
 int DEVICE     = 0;                                             //!< GPU device ID
 int IMAGES     = 0;                                             //!< Number of periodic image sublevels
 real THETA     = .5;                                            //!< Multipole acceptance criteria
-vect Xperiodic = 0;                                             //!< Coordinate offset of periodic image
+vec3 Xperiodic = 0;                                             //!< Coordinate offset of periodic image
 #if PAPI
 int PAPIEVENT  = PAPI_NULL;                                     //!< PAPI event handle
 #endif
 extern int DEVICE;                                              //!< GPU device ID
 extern int IMAGES;                                              //!< Number of periodic image sublevels
 extern real THETA;                                              //!< Multipole acceptance criteria
-extern vect Xperiodic;                                          //!< Coordinate offset of periodic image
+extern vec3 Xperiodic;                                          //!< Coordinate offset of periodic image
 #if PAPI
 extern int PAPIEVENT;                                           //!< PAPI event handle
 #endif
 
 const int  P        = 10;                                       //!< Order of expansions
 const int  NCRIT    = 100;                                      //!< Number of bodies per cell
+const int  NSPAWN   = 1000;                                     //!< Threshold of NDBODY for spawning new threads
 const int  MAXBODY  = 200000;                                   //!< Maximum number of bodies per GPU kernel
 const int  MAXCELL  = 10000000;                                 //!< Maximum number of bodies/coefs in cell per GPU kernel
 const real CLET     = 2;                                        //!< LET opening critetia
   int         IBODY;                                            //!< Initial body numbering for sorting back
   int         IPROC;                                            //!< Initial process numbering for partitioning back
   bigint      ICELL;                                            //!< Cell index
-  vect        X;                                                //!< Position
+  vec3        X;                                                //!< Position
   real        SRC;                                              //!< Scalar source values
 };
 typedef std::vector<JBody>             JBodies;                 //!< Vector of source bodies
 //! Linked list of leafs (only used in fast/topdown.h)
 struct Leaf {
   int I;                                                        //!< Unique index for every leaf
-  vect X;                                                       //!< Coordinate of leaf
+  vec3 X;                                                       //!< Coordinate of leaf
   Leaf *NEXT;                                                   //!< Pointer to next leaf
 };
 typedef std::vector<Leaf>              Leafs;                   //!< Vector of leafs
   int  LEVEL;                                                   //!< Level in the tree structure
   int  NBODY;                                                   //!< Number of descendant leafs
   int  CHILD[8];                                                //!< Index of child node
-  vect X;                                                       //!< Coordinate at center
+  vec3 X;                                                       //!< Coordinate at center
   Leaf *BODY;                                                   //!< Pointer to first leaf
 };
 typedef std::vector<Node>              Nodes;                   //!< Vector of nodes
   int      PARENT;                                              //!< Iterator offset of parent cell
   int      CHILD;                                               //!< Iterator offset of child cells
   B_iter   BODY;                                                //!< Iterator of first leaf
-  vect     X;                                                   //!< Cell center
+  vec3     X;                                                   //!< Cell center
   real     R;                                                   //!< Cell radius
   real     RMAX;                                                //!< Max cell radius
   real     RCRIT;                                               //!< Critical cell radius
 
 //! Structure for Ewald summation
 struct Ewald {
-  vect K;                                                       //!< 3-D wave number vector
+  vec3 K;                                                       //!< 3-D wave number vector
   real REAL;                                                    //!< real part of wave
   real IMAG;                                                    //!< imaginary part of wave
 };
     for_i c+=b[i]*b[i];
     return c;
   }
+  friend vec min(const vec &b, const vec &c) {                     // Element-wise minimum
+    vec d;
+    for (int i=0; i<N; i++) d[i] = b[i] < c[i] ? b[i] : c[i];
+    return d;
+  }
+  friend vec max(const vec &b, const vec &c) {                     // Element-wise maximum
+    vec d;
+    for (int i=0; i<N; i++) d[i] = b[i] > c[i] ? b[i] : c[i];
+    return d;
+  }
 };
 
 #undef for_i
   vtkPoints *points[maxGroups];
   vtkPoints *hexPoints;
 public:
-  void setDomain(const real r0, const vect x0) {
+  void setDomain(const real r0, const vec3 x0) {
     hexPoints = vtkPoints::New();
     hexPoints->SetNumberOfPoints(8);
     hexPoints->SetPoint(0, x0[0]-r0, x0[1]-r0, x0[2]-r0);
     points[Igroup]->SetNumberOfPoints(Npoints);
   }
 
-  void setPoints(const int Igroup, const vect X) {
+  void setPoints(const int Igroup, const vec3 X) {
     points[Igroup]->SetPoint(I[Igroup],X[0],X[1],X[2]);
     I[Igroup]++;
   }

kernel/CPUCartesianLaplace.cxx

 template<int n, int kx, int ky , int kz, int d>
 struct DerivativeTerm {
   static const int coef = 1 - 2 * n;
-  static inline real kernel(const Lset &C, const vect &dist) {
+  static inline real kernel(const Lset &C, const vec3 &dist) {
     return coef * dist[d] * C[Index<kx,ky,kz>::I];
   }
 };
 template<int n, int kx, int ky , int kz>
 struct DerivativeTerm<n,kx,ky,kz,-1> {
   static const int coef = 1 - n;
-  static inline real kernel(const Lset &C, const vect&) {
+  static inline real kernel(const Lset &C, const vec3&) {
     return coef * C[Index<kx,ky,kz>::I];
   }
 };
   static const int nextflag = 5 - (kz < nz || kz == 1);
   static const int dim = kz == (nz-1) ? -1 : 2;
   static const int n = nx + ny + nz;
-  static inline real loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vec3 &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,kz-1,nextflag>::loop(C,dist)
          + DerivativeTerm<n,nx,ny,kz-1,dim>::kernel(C,dist);
   }
 template<int nx, int ny, int nz, int kx, int ky, int kz>
 struct DerivativeSum<nx,ny,nz,kx,ky,kz,4> {
   static const int nextflag = 3 - (ny == 0);
-  static inline real loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vec3 &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,nz,nextflag>::loop(C,dist);
   }
 };
   static const int nextflag = 3 - (ky < ny || ky == 1);
   static const int dim = ky == (ny-1) ? -1 : 1;
   static const int n = nx + ny + nz;
-  static inline real loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vec3 &dist) {
     return DerivativeSum<nx,ny,nz,nx,ky-1,nz,nextflag>::loop(C,dist)
          + DerivativeTerm<n,nx,ky-1,nz,dim>::kernel(C,dist);
   }
 template<int nx, int ny, int nz, int kx, int ky, int kz>
 struct DerivativeSum<nx,ny,nz,kx,ky,kz,2> {
   static const int nextflag = 1 - (nx == 0);
-  static inline real loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vec3 &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,nz,nextflag>::loop(C,dist);
   }
 };
   static const int nextflag = 1 - (kx < nx || kx == 1);
   static const int dim = kx == (nx-1) ? -1 : 0;
   static const int n = nx + ny + nz;
-  static inline real loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vec3 &dist) {
     return DerivativeSum<nx,ny,nz,kx-1,ny,nz,nextflag>::loop(C,dist)
          + DerivativeTerm<n,kx-1,ny,nz,dim>::kernel(C,dist);
   }
 
 template<int nx, int ny, int nz, int kx, int ky, int kz>
 struct DerivativeSum<nx,ny,nz,kx,ky,kz,0> {
-  static inline real loop(const Lset&, const vect&) {
+  static inline real loop(const Lset&, const vec3&) {
     return 0;
   }
 };
 
 template<int nx, int ny, int nz, int kx, int ky>
 struct DerivativeSum<nx,ny,nz,kx,ky,0,5> {
-  static inline real loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vec3 &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,0,4>::loop(C,dist);
   }
 };
 
 template<int nx, int ny, int nz>
 struct Terms {
-  static inline void power(Lset &C, const vect &dist) {
+  static inline void power(Lset &C, const vec3 &dist) {
     Terms<nx,ny+1,nz-1>::power(C,dist);
     C[Index<nx,ny,nz>::I] = C[Index<nx,ny,nz-1>::I] * dist[2] / nz;
   }
-  static inline void derivative(Lset &C, const vect &dist, const real &invR2) {
+  static inline void derivative(Lset &C, const vec3 &dist, const real &invR2) {
     static const int n = nx + ny + nz;
     Terms<nx,ny+1,nz-1>::derivative(C,dist,invR2);
     C[Index<nx,ny,nz>::I] = DerivativeSum<nx,ny,nz>::loop(C,dist) / n * invR2;
 
 template<int nx, int ny>
 struct Terms<nx,ny,0> {
-  static inline void power(Lset &C, const vect &dist) {
+  static inline void power(Lset &C, const vec3 &dist) {
     Terms<nx+1,0,ny-1>::power(C,dist);
     C[Index<nx,ny,0>::I] = C[Index<nx,ny-1,0>::I] * dist[1] / ny;
   }
-  static inline void derivative(Lset &C, const vect &dist, const real &invR2) {
+  static inline void derivative(Lset &C, const vec3 &dist, const real &invR2) {
     static const int n = nx + ny;
     Terms<nx+1,0,ny-1>::derivative(C,dist,invR2);
     C[Index<nx,ny,0>::I] = DerivativeSum<nx,ny,0>::loop(C,dist) / n * invR2;
 
 template<int nx>
 struct Terms<nx,0,0> {
-  static inline void power(Lset &C, const vect &dist) {
+  static inline void power(Lset &C, const vec3 &dist) {
     Terms<0,0,nx-1>::power(C,dist);
     C[Index<nx,0,0>::I] = C[Index<nx-1,0,0>::I] * dist[0] / nx;
   }
-  static inline void derivative(Lset &C, const vect &dist, const real &invR2) {
+  static inline void derivative(Lset &C, const vec3 &dist, const real &invR2) {
     static const int n = nx;
     Terms<0,0,nx-1>::derivative(C,dist,invR2);
     C[Index<nx,0,0>::I] = DerivativeSum<nx,0,0>::loop(C,dist) / n * invR2;
 
 template<>
 struct Terms<0,0,0> {
-  static inline void power(Lset&, const vect&) {}
-  static inline void derivative(Lset&, const vect&, const real&) {}
+  static inline void power(Lset&, const vec3&) {}
+  static inline void derivative(Lset&, const vec3&, const real&) {}
   static inline void scale(Lset&) {}
 };
 
 };
 
 template<int PP>
-inline void getCoef(Lset &C, const vect &dist, real &invR2, const real &invR) {
+inline void getCoef(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
   C[0] = invR;
   Terms<0,0,PP>::derivative(C,dist,invR2);
   Terms<0,0,PP>::scale(C);
 }
 
 template<>
-inline void getCoef<1>(Lset &C, const vect &dist, real &invR2, const real &invR) {
+inline void getCoef<1>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
   C[0] = invR;
   invR2 = -invR2;
   real x = dist[0], y = dist[1], z = dist[2];
 }
 
 template<>
-inline void getCoef<2>(Lset &C, const vect &dist, real &invR2, const real &invR) {
+inline void getCoef<2>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
   getCoef<1>(C,dist,invR2,invR);
   real x = dist[0], y = dist[1], z = dist[2];
   real invR3 = invR * invR2;
 }
 
 template<>
-inline void getCoef<3>(Lset &C, const vect &dist, real &invR2, const real &invR) {
+inline void getCoef<3>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
   getCoef<2>(C,dist,invR2,invR);
   real x = dist[0], y = dist[1], z = dist[2];
   real invR3 = invR * invR2;
 }
 
 template<>
-inline void getCoef<4>(Lset &C, const vect &dist, real &invR2, const real &invR) {
+inline void getCoef<4>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
   getCoef<3>(C,dist,invR2,invR);
   real x = dist[0], y = dist[1], z = dist[2];
   real invR3 = invR * invR2;
 }
 
 template<>
-inline void getCoef<5>(Lset &C, const vect &dist, real &invR2, const real &invR) {
+inline void getCoef<5>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
   getCoef<4>(C,dist,invR2,invR);
   real x = dist[0], y = dist[1], z = dist[2];
   real invR3 = invR * invR2;
 }
 
 template<>
-inline void getCoef<6>(Lset &C, const vect &dist, real &invR2, const real &invR) {
+inline void getCoef<6>(Lset &C, const vec3 &dist, real &invR2, const real &invR) {
   getCoef<5>(C,dist,invR2,invR);
   real x = dist[0], y = dist[1], z = dist[2];
   real invR3 = invR * invR2;
   real Rmax = 0;
 //  setCenter(Ci);
   for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NCBODY; ++B ) {
-    vect dist = Ci->X - B->X;
+    vec3 dist = Ci->X - B->X;
     real R = std::sqrt(norm(dist));
     if( R > Rmax ) Rmax = R;
     Lset M;
   real Rmax = Ci->RMAX;
 //  setCenter(Ci);
   for( C_iter Cj=Cj0+Ci->CHILD; Cj!=Cj0+Ci->CHILD+Ci->NCHILD; ++Cj ) {
-    vect dist = Ci->X - Cj->X;
+    vec3 dist = Ci->X - Cj->X;
     real R = std::sqrt(norm(dist)) + Cj->RCRIT;
     if( R > Rmax ) Rmax = R;
     Mset M;
 
 template<>
 void Kernel<Laplace>::M2L(C_iter Ci, C_iter Cj) const {
-  vect dist = Ci->X - Cj->X - Xperiodic;
+  vec3 dist = Ci->X - Cj->X - Xperiodic;
   real invR2 = 1 / norm(dist);
   real invR  = Ci->M[0] * Cj->M[0] * std::sqrt(invR2);
   Lset C;
 template<>
 void Kernel<Laplace>::M2P(C_iter Ci, C_iter Cj) const {
   for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NDBODY; ++B ) {
-    vect dist = B->X - Cj->X - Xperiodic;
+    vec3 dist = B->X - Cj->X - Xperiodic;
     real invR2 = 1 / norm(dist);
     real invR  = Cj->M[0] * std::sqrt(invR2);
     Lset C;
 template<>
 void Kernel<Laplace>::L2L(C_iter Ci) const {
   C_iter Cj = Ci0 + Ci->PARENT;
-  vect dist = Ci->X - Cj->X;
+  vec3 dist = Ci->X - Cj->X;
   Lset C;
   C[0] = 1;
   Terms<0,0,P>::power(C,dist);
 template<>
 void Kernel<Laplace>::L2P(C_iter Ci) const {
   for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NCBODY; ++B ) {
-    vect dist = B->X - Ci->X;
+    vec3 dist = B->X - Ci->X;
     Lset C, L;
     C[0] = 1;
     Terms<0,0,P>::power(C,dist);

kernel/CPUEwaldLaplace.cxx

 void Kernel<Laplace>::EwaldReal(C_iter Ci, C_iter Cj) const {   // Ewald real part on CPU
   for( B_iter Bi=Ci->BODY; Bi!=Ci->BODY+Ci->NDBODY; ++Bi ) {    // Loop over target bodies
     for( B_iter Bj=Cj->BODY; Bj!=Cj->BODY+Cj->NDBODY; ++Bj ) {  //  Loop over source bodies
-      vect dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
+      vec3 dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
       for( int d=0; d<3; d++ ) {                                //   Loop over dimensions
         if( dist[d] < -R0 ) dist[d] += 2 * R0;                  //    Wrap domain so that target is always at
         if( dist[d] >= R0 ) dist[d] -= 2 * R0;                  //    the center of a [-R0,R0]^3 source cube
     for( int d=0; d<3; d++ ) B->TRG[d+1] *= scale;
   }
 
-  vect dipole = 0;
+  vec3 dipole = 0;
   for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {
     dipole += (B->X - R0) * B->SRC;
   }

kernel/CPUP2P.cxx

   int i = 0;
   for ( ; i<n; i++) {
     real pot = 0;
-    vect acc = 0;
+    vec3 acc = 0;
     for (int j=i+1; j<n; j++) {
-      vect dX = B[i].X - B[j].X;
+      vec3 dX = B[i].X - B[j].X;
       real R2 = norm(dX) + EPS2;
       if (R2 != 0) {
         real invR2 = 1.0 / R2;
   int i = 0;
   for ( ; i<ni; i++) {
     real pot = 0;
-    vect acc = 0;
+    vec3 acc = 0;
     for (int j=0; j<nj; j++) {
-      vect dX = Bi[i].X - Bj[j].X - Xperiodic;
+      vec3 dX = Bi[i].X - Bj[j].X - Xperiodic;
       real R2 = norm(dX) + EPS2;
       if (R2 != 0) {
         real invR2 = 1.0f / R2;
 #else
   for( B_iter Bi=Ci->BODY; Bi!=Ci->BODY+Ci->NDBODY; ++Bi ) {    // Loop over target bodies
     real P0 = 0;                                                //  Initialize potential
-    vect F0 = 0;                                                //  Initialize force
+    vec3 F0 = 0;                                                //  Initialize force
     for( B_iter Bj=Cj->BODY; Bj!=Cj->BODY+Cj->NDBODY; ++Bj ) {  //  Loop over source bodies
-      vect dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
+      vec3 dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
       real R2 = norm(dist) + EPS2;                              //   R^2
       real invR2 = 1.0 / R2;                                    //   1 / R^2
       if( R2 == 0 ) invR2 = 0;                                  //   Exclude self interaction
     int atypei = int(Bi->SRC);                                  //  Atom type of target
     for( B_iter Bj=Cj->BODY; Bj!=Cj->BODY+Cj->NDBODY; ++Bj ) {  //  Loop over source bodies
       int atypej = int(Bj->SRC);                                //   Atom type of source
-      vect dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
+      vec3 dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
       real R2 = norm(dist);                                     //   R squared
       if( R2 != 0 ) {                                           //   Exclude self interaction
         real rs = RSCALE[atypei*ATOMS+atypej];                  //    r scale

kernel/CPUSphericalLaplace.cxx

 
 namespace{
 //! Get r,theta,phi from x,y,z
-void cart2sph(real& r, real& theta, real& phi, vect dist) {
+void cart2sph(real& r, real& theta, real& phi, vec3 dist) {
   r = sqrt(norm(dist)) * (1 + EPS);                           // r = sqrt(x^2 + y^2 + z^2)
   if( r < EPS ) {                                             // If r == 0
     theta = 0;                                                //  theta can be anything so we set it to 0
   real Rmax = 0;
   complex Ynm[4*P*P], YnmTheta[4*P*P];
   for( B_iter B=Cj->BODY; B!=Cj->BODY+Cj->NCBODY; ++B ) {
-    vect dist = B->X - Cj->X;
+    vec3 dist = B->X - Cj->X;
     real R = std::sqrt(norm(dist));
     if( R > Rmax ) Rmax = R;
     real rho, alpha, beta;
   complex Ynm[4*P*P], YnmTheta[4*P*P];
   real Rmax = Ci->RMAX;
   for( C_iter Cj=Cj0+Ci->CHILD; Cj!=Cj0+Ci->CHILD+Ci->NCHILD; ++Cj ) {
-    vect dist = Ci->X - Cj->X;
+    vec3 dist = Ci->X - Cj->X;
     real R = std::sqrt(norm(dist)) + Cj->RCRIT;
     if( R > Rmax ) Rmax = R;
     real rho, alpha, beta;
 template<>
 void Kernel<Laplace>::M2L(C_iter Ci, C_iter Cj) const {
   complex Ynm[4*P*P], YnmTheta[4*P*P];
-  vect dist = Ci->X - Cj->X - Xperiodic;
+  vec3 dist = Ci->X - Cj->X - Xperiodic;
   real rho, alpha, beta;
   cart2sph(rho,alpha,beta,dist);
   evalLocal(rho,alpha,beta,Ynm,YnmTheta);
   const complex I(0.,1.);                                       // Imaginary unit
   complex Ynm[4*P*P], YnmTheta[4*P*P];
   for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NDBODY; ++B ) {
-    vect dist = B->X - Cj->X - Xperiodic;
-    vect spherical = 0;
-    vect cartesian = 0;
+    vec3 dist = B->X - Cj->X - Xperiodic;
+    vec3 spherical = 0;
+    vec3 cartesian = 0;
     real r, theta, phi;
     cart2sph(r,theta,phi,dist);
     evalLocal(r,theta,phi,Ynm,YnmTheta);
   const complex I(0.,1.);
   complex Ynm[4*P*P], YnmTheta[4*P*P];
   C_iter Cj = Ci0 + Ci->PARENT;
-  vect dist = Ci->X - Cj->X;
+  vec3 dist = Ci->X - Cj->X;
   real rho, alpha, beta;
   cart2sph(rho,alpha,beta,dist);
   evalMultipole(rho,alpha,beta,Ynm,YnmTheta);
   const complex I(0.,1.);                                       // Imaginary unit
   complex Ynm[4*P*P], YnmTheta[4*P*P];
   for( B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NCBODY; ++B ) {
-    vect dist = B->X - Ci->X;
-    vect spherical = 0;
-    vect cartesian = 0;
+    vec3 dist = B->X - Ci->X;
+    vec3 spherical = 0;
+    vec3 cartesian = 0;
     real r, theta, phi;
     cart2sph(r,theta,phi,dist);
     evalMultipole(r,theta,phi,Ynm,YnmTheta);

unit_test/Makefile

 	$(SERIALRUN)
 
 serialrun: serialrun.cxx $(OBJECT)
-	$(CXX) $? $(LFLAGS) $(VFLAGS)
+	$(CXX) $? $(LFLAGS) $(VFLAGS) -DTOPDOWN -DCOMPARE
 	$(SERIALRUN)
 
 unsort: unsort.cxx $(OBJECT)

unit_test/fdgl.cxx

 struct JVertex {
   int       Ista;                                               //!< Start index of connection list
   int       Iend;                                               //!< End index of connection list
-  vect      X;                                                  //!< Position vector
+  vec3      X;                                                  //!< Position vector
 };
 
 //! Structure of vertices
 #if VTK
   vtkIdType Id;                                                 //!< VTK vertex ID
 #endif
-  vect      F;                                                  //!< Force vector
+  vec3      F;                                                  //!< Force vector
 };
 
 const std::string  INPUT_PATH = "";                             //!< Input file name
   Bodies jbodies;                                               // Define vector of source bodies
   Cells cells, jcells;                                          // Define vector of cells
   B_iter B = bodies.begin();                                    // Iterator of first body
-  vect Xmin = B->X;                                             // Initialize minimum of X
-  vect Xmax = B->X;                                             // Initialize maximum of X
+  vec3 Xmin = B->X;                                             // Initialize minimum of X
+  vec3 Xmax = B->X;                                             // Initialize maximum of X
   for( V_iter V=vertices.begin(); V!=vertices.end(); ++V, ++B ) {// Loop over vertices
     B->X = V->X;                                                //  Copy vertex position to body position
     B->SRC = -100;                                              //  Set source value
   for( V_iter VI=vertices.begin(); VI!=vertices.end(); ++VI ) { // Loop over target vertices
     for( int i=VI->Ista; i<VI->Iend; ++i ) {                    //  Loop over edges
       JV_iter VJ = jvertices.begin()+i;                         //   Get source vertex
-      vect dist = VI->X - VJ->X;                                //   Distance vector from source to target
+      vec3 dist = VI->X - VJ->X;                                //   Distance vector from source to target
       float R = sqrtf(norm(dist) + EPS2);                       //   Scalar distance from source to target
       float weight = (VI->Iend-VI->Ista) * (VJ->Iend-VJ->Ista); //   Weight based on number of edges
       VI->F -= dist / R * (R - l / weight);                     //   Spring force vector
 //! Move vertices
 void moveVertices() {
   for( V_iter V=vertices.begin(); V!=vertices.end(); ++V ) {    // Loop over vertices
-    vect dX;                                                    //  Position increment
+    vec3 dX;                                                    //  Position increment
     if( norm(V->F) < EPS ) dX = 0;                              //  Filter noisy movement
     else dX = V->F / std::sqrt(norm(V->F));                     //  Always move at constant speed
     V->X += dX;                                                 //  Update position

unit_test/poisson.cxx

     B->SRC = -(4 * L * L * norm(B->X) - 6 * L) * exp(-L * norm(B->X)) * dV * .25 / M_PI;// Gaussian source
     B->TRG = 0;                                                 //  Initialize target values
   }                                                             // End loop over bodies
-  vect X0 = 0;                                                  // Center of root cell
+  vec3 X0 = 0;                                                  // Center of root cell
   FMM.setX0(X0);                                                // Set center of root cell
   FMM.setR0(1);                                                 // Set radius of root cell
   FMM.stopTimer("Set bodies",FMM.printNow);                     // Stop timer