Commits

Rio Yokota committed 5dd5014

real_t -> real, complex_t -> complex.

Comments (0)

Files changed (30)

 .PHONY: docs
 
 ### CUDA path
-CUDA_INSTALL_PATH = /opt/nvidia/cudatoolkit/5.0.35.102
+CUDA_INSTALL_PATH = /usr/local/cuda
 
 ### VTK path
 VTK_INCLUDE_PATH = /usr/include/vtk-5.8
 EXPAND  = Spherical
 
 ### GCC compiler
-#CXX	= mpicxx -ggdb3 -Wall -Wextra -O3 -fopenmp -ffast-math -funroll-loops -fforce-addr -fPIC -I../include
+CXX	= mpicxx -ggdb3 -Wall -Wextra -O3 -fopenmp -ffast-math -funroll-loops -fforce-addr -fPIC -I../include
 #CXX	= mpicxx -ggdb3 -Wall -Wextra -O3 -ffast-math -funroll-loops -fforce-addr -fPIC -I../include
 ### Intel compiler
 #CXX	= mpicxx -Wall -xHOST -O3 -openmp -funroll-loops -finline-functions -fPIC -ansi-alias -I../include
 ### PGI compiler
-CXX	= CC -O3 -I../include
+#CXX	= CC -O3 -I../include
 ### BG/P compiler
 #CXX	= mpixlcxx -qarch=450 -qtune=450 -I../include
 ### K computer

include/bottomup.h

   int getMaxLevel(Bodies &bodies) {
     const long N = bodies.size() * MPISIZE;                     // Number of bodies
     int level;                                                  // Max level
-    level = N >= NCRIT ? 1 + int(log(real_t(N / NCRIT)/M_LN2/3)) : 0;// Decide max level from N/Ncrit
+    level = N >= NCRIT ? 1 + int(log(N / NCRIT)/M_LN2/3) : 0;   // Decide max level from N/Ncrit
     if( level < 2 ) level = 2;
-    int MPIlevel = int(log(real_t(MPISIZE-1)) / M_LN2 / 3) + 1; // Level of local root cell
+    int MPIlevel = int(log(MPISIZE-1) / M_LN2 / 3) + 1;         // Level of local root cell
     if( MPISIZE == 1 ) MPIlevel = 0;                            // For serial execution local root cell is root cell
     if( MPIlevel > level ) {                                    // If process hierarchy is deeper than tree
       std::cout << "Process hierarchy is deeper than tree @ rank" << MPIRANK << std::endl;
     bigint i;                                                   // Levelwise cell index
     if( level == -1 ) level = getMaxLevel(bodies);              // Decide max level
     bigint off = ((1 << 3*level) - 1) / 7;                      // Offset for each level
-    real_t r = R0 / (1 << (level-1));                           // Radius at finest level
+    real r = R0 / (1 << (level-1));                             // Radius at finest level
     vec<3,int> nx;                                              // 3-D cell index
     if( end == 0 ) end = bodies.size();                         // Default size is all bodies
     for( int b=begin; b!=end; ++b ) {                           // Loop over bodies

include/dataset.h

 
 //! Evaluate relative L2 norm error
   void evalError(Bodies &bodies, Bodies &bodies2,
-                 real_t &diff1, real_t &norm1, real_t &diff2, real_t &norm2, bool ewald=false) {
-    real_t p = 0, p2 = 0;                                         // Total energy
+                 real &diff1, real &norm1, real &diff2, real &norm2, bool ewald=false) {
+    real p = 0, p2 = 0;                                         // Total energy
     B_iter B2 = bodies2.begin();                                // Set iterator for bodies2
     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B, ++B2 ) {// Loop over bodies & bodies2
 #ifdef DEBUG
   }
 
 //! Print relative L2 norm error
-  void printError(real_t diff1, real_t norm1, real_t diff2, real_t norm2) {
+  void printError(real diff1, real norm1, real diff2, real norm2) {
     std::cout << std::setw(stringLength) << std::left
               << "Error (pot)" << " : " << std::sqrt(diff1/norm1) << std::endl;
     std::cout << std::setw(stringLength) << std::left
 
 //! Evaluate relative L2 norm error
   void evalError(Bodies &bodies, Bodies &bodies2,
-                 real_t &diff1, real_t &norm1, real_t &diff2, real_t &norm2) {
+                 real &diff1, real &norm1, real &diff2, real &norm2) {
     B_iter B2 = bodies2.begin();                                // Set iterator for bodies2
     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B, ++B2 ) {// Loop over bodies & bodies2
 #ifdef DEBUG
   }
 
 //! Print relative L2 norm error
-  void printError(real_t diff1, real_t norm1, real_t diff2, real_t norm2) {
+  void printError(real diff1, real norm1, real diff2, real norm2) {
     std::cout << std::setw(stringLength) << std::left
               << "Error (pot)" << " : " << std::sqrt(diff1/norm1) << std::endl;
     std::cout << std::setw(stringLength) << std::left

include/evaluator.h

 template<Equation equation>
 class Evaluator : public Dataset<equation> {
 private:
-  real_t      timeM2L;                                          //!< M2L execution time
-  real_t      timeM2P;                                          //!< M2P execution time
-  real_t      timeP2P;                                          //!< P2P execution time
+  real        timeM2L;                                          //!< M2L execution time
+  real        timeM2P;                                          //!< M2P execution time
+  real        timeP2P;                                          //!< P2P execution time
 
 protected:
   C_iter      CiB;                                              //!< icells begin per call
   Maps        flagM2P;                                          //!< Existance of periodic image for M2P
   Maps        flagP2P;                                          //!< Existance of periodic image for P2P
 
-  real_t      NP2P;                                             //!< Number of P2P kernel calls
-  real_t      NM2P;                                             //!< Number of M2P kernel calls
-  real_t      NM2L;                                             //!< Number of M2L kernel calls
+  real        NP2P;                                             //!< Number of P2P kernel calls
+  real        NM2P;                                             //!< Number of M2P kernel calls
+  real        NM2L;                                             //!< Number of M2L kernel calls
 
 public:
   using Kernel<equation>::startTimer;                           //!< Start timer for given event
       cellStack.pop();                                          //  Pop traversal stack
       for( C_iter Cj=Cj0+C->CHILD; Cj!=Cj0+C->CHILD+C->NCHILD; ++Cj ) {// Loop over cell's children
         vect dX = Ci->X - Cj->X - Xperiodic;                    //   Distance vector from source to target
-        real_t Rq = std::sqrt(norm(dX));                        //   Scalar distance
+        real Rq = std::sqrt(norm(dX));                          //   Scalar distance
         if( Rq * THETA < Ci->R + Cj->R && Cj->NCHILD == 0 ) {   //   If twigs are close
           evalEwaldReal(Ci,Cj);                                 //    Ewald real part
         } else if( Cj->NCHILD != 0 ) {                          //   If cells are not twigs
   int getPeriodicRange() {
     int prange = 0;                                             //  Range of periodic images
     for( int i=0; i!=IMAGES; ++i ) {                            //  Loop over periodic image sublevels
-      prange += int(std::pow(real_t(3),i));                     //   Accumulate range of periodic images
+      prange += int(pow(3,i));                                  //   Accumulate range of periodic images
     }                                                           //  End loop over perioidc image sublevels
     return prange;                                              // Return range of periodic images
   }
       for( int d=0; d!=3; ++d ) {                               //  Loop over dimension
         B->X[d] = drand48() * 2 - 1;                            //   Initialize positions
       }                                                         //  End loop over dimension
-      real_t r = std::sqrt(norm(B->X));                         //  Distance from center
+      real r = std::sqrt(norm(B->X));                           //  Distance from center
       for( int d=0; d!=3; ++d ) {                               //  Loop over dimension
         B->X[d] /= r * 1.1;                                     //   Normalize positions
       }                                                         //  End loop over dimension
 //! Use multipole acceptance criteria to determine whether to approximate, do P2P, or subdivide
   void interact(C_iter Ci, C_iter Cj, PairQueue &pairQueue) {
     vect dX = Ci->X - Cj->X - Xperiodic;                        // Distance vector from source to target
-    real_t Rq = std::sqrt(norm(dX));                            // Scalar distance
+    real Rq = std::sqrt(norm(dX));                              // Scalar distance
     if( Rq * THETA > Ci->R + Cj->R ) {                          // If distance if far enough
       approximate(Ci,Cj);                                       //  Use approximate kernels, e.g. M2L, M2P
     } else if(Ci->NCHILD == 0 && Cj->NCHILD == 0) {             // Else if both cells are leafs
   C_iter               Cj0;                                     //!< Begin iterator for source cells
 
   int                  ATOMS;                                   //!< Number of atom types in Van der Waals
-  std::vector<real_t>  RSCALE;                                  //!< Scaling parameter for Van der Waals
-  std::vector<real_t>  GSCALE;                                  //!< Scaling parameter for Van der Waals
-  real_t               KSIZE;                                   //!< Number of waves in Ewald summation
-  real_t               ALPHA;                                   //!< Scaling parameter for Ewald summation
-  real_t               SIGMA;                                   //!< Scaling parameter for Ewald summation
+  std::vector<real>    RSCALE;                                  //!< Scaling parameter for Van der Waals
+  std::vector<real>    GSCALE;                                  //!< Scaling parameter for Van der Waals
+  real                 KSIZE;                                   //!< Number of waves in Ewald summation
+  real                 ALPHA;                                   //!< Scaling parameter for Ewald summation
+  real                 SIGMA;                                   //!< Scaling parameter for Ewald summation
 
   std::vector<int>     keysHost;                                //!< Offsets for rangeHost
   std::vector<int>     rangeHost;                               //!< Offsets for sourceHost
   gpureal             *sourceDevc;                              //!< Sources on device
   gpureal             *targetDevc;                              //!< Targets on device
 
-  real_t *factorial;                                            //!< Factorial
-  real_t *prefactor;                                            //!< \f$ \sqrt{ \frac{(n - |m|)!}{(n + |m|)!} } \f$
-  real_t *Anm;                                                  //!< \f$ (-1)^n / \sqrt{ \frac{(n + m)!}{(n - m)!} } \f$
-  complex_t *Cnm;                                               //!< M2L translation matrix \f$ C_{jn}^{km} \f$
+  real *factorial;                                              //!< Factorial
+  real *prefactor;                                              //!< \f$ \sqrt{ \frac{(n - |m|)!}{(n + |m|)!} } \f$
+  real *Anm;                                                    //!< \f$ (-1)^n / \sqrt{ \frac{(n + m)!}{(n - m)!} } \f$
+  complex *Cnm;                                                 //!< M2L translation matrix \f$ C_{jn}^{km} \f$
 public:
   vect X0;                                                      //!< Center of root cell
-  real_t R0;                                                    //!< Radius of root cell
+  real R0;                                                      //!< Radius of root cell
 
 private:
-  real_t getBmax(vect const&X, C_iter C) const {
-    real_t rad = C->R;
-    real_t dx = rad+std::abs(X[0]-C->X[0]);
-    real_t dy = rad+std::abs(X[1]-C->X[1]);
-    real_t dz = rad+std::abs(X[2]-C->X[2]);
+  real getBmax(vect const&X, C_iter C) const {
+    real rad = C->R;
+    real dx = rad+std::abs(X[0]-C->X[0]);
+    real dy = rad+std::abs(X[1]-C->X[1]);
+    real dz = rad+std::abs(X[2]-C->X[2]);
     return std::sqrt( dx*dx + dy*dy + dz*dz );
   }
 
 protected:
   void setCenter(C_iter C) const {
-    real_t m = 0;
+    real m = 0;
     vect X = 0;
     for( B_iter B=C->LEAF; B!=C->LEAF+C->NCLEAF; ++B ) {
       m += std::abs(B->SRC);
   }
 
 //! Evaluate solid harmonics \f$ r^n Y_{n}^{m} \f$
-  void evalMultipole(real_t rho, real_t alpha, real_t beta, complex_t *Ynm, complex_t *YnmTheta) const {
-    const complex_t I(0.,1.);                                   // Imaginary unit
-    real_t x = std::cos(alpha);                                 // x = cos(alpha)
-    real_t y = std::sin(alpha);                                 // y = sin(alpha)
-    real_t fact = 1;                                            // Initialize 2 * m + 1
-    real_t pn = 1;                                              // Initialize Legendre polynomial Pn
-    real_t rhom = 1;                                            // Initialize rho^m
+  void evalMultipole(real rho, real alpha, real beta, complex *Ynm, complex *YnmTheta) const {
+    const complex I(0.,1.);                                     // Imaginary unit
+    real x = std::cos(alpha);                                   // x = cos(alpha)
+    real y = std::sin(alpha);                                   // y = sin(alpha)
+    real fact = 1;                                              // Initialize 2 * m + 1
+    real pn = 1;                                                // Initialize Legendre polynomial Pn
+    real rhom = 1;                                              // Initialize rho^m
     for( int m=0; m!=P; ++m ) {                                 // Loop over m in Ynm
-      complex_t eim = std::exp(I * real_t(m * beta));           //  exp(i * m * beta)
-      real_t p = pn;                                            //  Associated Legendre polynomial Pnm
+      complex eim = std::exp(I * real(m * beta));               //  exp(i * m * beta)
+      real p = pn;                                              //  Associated Legendre polynomial Pnm
       int npn = m * m + 2 * m;                                  //  Index of Ynm for m > 0
       int nmn = m * m;                                          //  Index of Ynm for m < 0
       Ynm[npn] = rhom * p * prefactor[npn] * eim;               //  rho^m * Ynm for m > 0
       Ynm[nmn] = std::conj(Ynm[npn]);                           //  Use conjugate relation for m < 0
-      real_t p1 = p;                                            //  Pnm-1
+      real p1 = p;                                              //  Pnm-1
       p = x * (2 * m + 1) * p1;                                 //  Pnm using recurrence relation
       YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * prefactor[npn] * eim;// theta derivative of r^n * Ynm
       rhom *= rho;                                              //  rho^m
-      real_t rhon = rhom;                                       //  rho^n
+      real rhon = rhom;                                         //  rho^n
       for( int n=m+1; n!=P; ++n ) {                             //  Loop over n in Ynm
         int npm = n * n + n + m;                                //   Index of Ynm for m > 0
         int nmm = n * n + n - m;                                //   Index of Ynm for m < 0
         Ynm[npm] = rhon * p * prefactor[npm] * eim;             //   rho^n * Ynm
         Ynm[nmm] = std::conj(Ynm[npm]);                         //   Use conjugate relation for m < 0
-        real_t p2 = p1;                                         //   Pnm-2
+        real p2 = p1;                                           //   Pnm-2
         p1 = p;                                                 //   Pnm-1
         p = (x * (2 * n + 1) * p1 - (n + m) * p2) / (n - m + 1);//   Pnm using recurrence relation
         YnmTheta[npm] = rhon * ((n - m + 1) * p - (n + 1) * x * p1) / y * prefactor[npm] * eim;// theta derivative
   }
 
 //! Evaluate singular harmonics \f$ r^{-n-1} Y_n^m \f$
-  void evalLocal(real_t rho, real_t alpha, real_t beta, complex_t *Ynm, complex_t *YnmTheta) const {
-    const complex_t I(0.,1.);                                   // Imaginary unit
-    real_t x = std::cos(alpha);                                 // x = cos(alpha)
-    real_t y = std::sin(alpha);                                 // y = sin(alpha)
-    real_t fact = 1;                                            // Initialize 2 * m + 1
-    real_t pn = 1;                                              // Initialize Legendre polynomial Pn
-    real_t rhom = 1.0 / rho;                                    // Initialize rho^(-m-1)
+  void evalLocal(real rho, real alpha, real beta, complex *Ynm, complex *YnmTheta) const {
+    const complex I(0.,1.);                                     // Imaginary unit
+    real x = std::cos(alpha);                                   // x = cos(alpha)
+    real y = std::sin(alpha);                                   // y = sin(alpha)
+    real fact = 1;                                              // Initialize 2 * m + 1
+    real pn = 1;                                                // Initialize Legendre polynomial Pn
+    real rhom = 1.0 / rho;                                      // Initialize rho^(-m-1)
     for( int m=0; m!=P; ++m ) {                                 // Loop over m in Ynm
-      complex_t eim = std::exp(I * real_t(m * beta));           //  exp(i * m * beta)
-      real_t p = pn;                                            //  Associated Legendre polynomial Pnm
+      complex eim = std::exp(I * real(m * beta));               //  exp(i * m * beta)
+      real p = pn;                                              //  Associated Legendre polynomial Pnm
       int npn = m * m + 2 * m;                                  //  Index of Ynm for m > 0
       int nmn = m * m;                                          //  Index of Ynm for m < 0
       Ynm[npn] = rhom * p * prefactor[npn] * eim;               //  rho^(-m-1) * Ynm for m > 0
       Ynm[nmn] = std::conj(Ynm[npn]);                           //  Use conjugate relation for m < 0
-      real_t p1 = p;                                            //  Pnm-1
+      real p1 = p;                                              //  Pnm-1
       p = x * (2 * m + 1) * p1;                                 //  Pnm using recurrence relation
       YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * prefactor[npn] * eim;// theta derivative of r^n * Ynm
       rhom /= rho;                                              //  rho^(-m-1)
-      real_t rhon = rhom;                                       //  rho^(-n-1)
+      real rhon = rhom;                                         //  rho^(-n-1)
       for( int n=m+1; n!=P; ++n ) {                             //  Loop over n in Ynm
         int npm = n * n + n + m;                                //   Index of Ynm for m > 0
         int nmm = n * n + n - m;                                //   Index of Ynm for m < 0
         Ynm[npm] = rhon * p * prefactor[npm] * eim;             //   rho^n * Ynm for m > 0
         Ynm[nmm] = std::conj(Ynm[npm]);                         //   Use conjugate relation for m < 0
-        real_t p2 = p1;                                         //   Pnm-2
+        real p2 = p1;                                           //   Pnm-2
         p1 = p;                                                 //   Pnm-1
         p = (x * (2 * n + 1) * p1 - (n + m) * p2) / (n - m + 1);//   Pnm using recurrence relation
         YnmTheta[npm] = rhon * ((n - m + 1) * p - (n + 1) * x * p1) / y * prefactor[npm] * eim;// theta derivative
 //! Set center of root cell
   void setX0(vect x0) {X0 = x0;}
 //! Set radius of root cell
-  void setR0(real_t r0) {R0 = r0;}
+  void setR0(real r0) {R0 = r0;}
 
 //! Get center of root cell
   vect getX0() const {return X0;}
 //! Get radius of root cell
-  real_t getR0() const {return R0;}
+  real getR0() const {return R0;}
 
 //! Set center and size of root cell
-  void setDomain(Bodies &bodies, vect x0=0, real_t r0=M_PI) {
+  void setDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
     vect xmin,xmax;                                             // Min,Max of domain
     for( int d=0; d!=3; ++d ) {                                 //  Loop over each dimension
       xmin[d] = x0[d] - r0;                                     //   Initialize xmin
 
 //! Precalculate M2L translation matrix
   void preCalculation() {
-    const complex_t I(0.,1.);                                   // Imaginary unit
-    factorial = new real_t [P];                                 // Factorial
-    prefactor = new real_t [P*P];                               // sqrt( (n - |m|)! / (n + |m|)! )
-    Anm       = new real_t [P*P];                               // (-1)^n / sqrt( (n + m)! / (n - m)! )
-    Cnm       = new complex_t [P*P*P*P];                        // M2L translation matrix Cjknm
+    const complex I(0.,1.);                                     // Imaginary unit
+    factorial = new real  [P];                                  // Factorial
+    prefactor = new real  [P*P];                                // sqrt( (n - |m|)! / (n + |m|)! )
+    Anm       = new real  [P*P];                                // (-1)^n / sqrt( (n + m)! / (n - m)! )
+    Cnm       = new complex [P*P*P*P];                          // M2L translation matrix Cjknm
 
     factorial[0] = 1;                                           // Initialize factorial
     for( int n=1; n!=P; ++n ) {                                 // Loop to P
       for( int m=-n; m<=n; ++m ) {                              //  Loop over m in Anm
         int nm = n*n+n+m;                                       //   Index of Anm
         int nabsm = abs(m);                                     //   |m|
-        real_t fnmm = EPS;                                      //   Initialize (n - m)!
+        real fnmm = EPS;                                        //   Initialize (n - m)!
         for( int i=1; i<=n-m; ++i ) fnmm *= i;                  //   (n - m)!
-        real_t fnpm = EPS;                                      //   Initialize (n + m)!
+        real fnpm = EPS;                                        //   Initialize (n + m)!
         for( int i=1; i<=n+m; ++i ) fnpm *= i;                  //   (n + m)!
-        real_t fnma = 1.0;                                      //   Initialize (n - |m|)!
+        real fnma = 1.0;                                        //   Initialize (n - |m|)!
         for( int i=1; i<=n-nabsm; ++i ) fnma *= i;              //   (n - |m|)!
-        real_t fnpa = 1.0;                                      //   Initialize (n + |m|)!
+        real fnpa = 1.0;                                        //   Initialize (n + |m|)!
         for( int i=1; i<=n+nabsm; ++i ) fnpa *= i;              //   (n + |m|)!
         prefactor[nm] = std::sqrt(fnma/fnpa);                   //   sqrt( (n - |m|)! / (n + |m|)! )
         Anm[nm] = ODDEVEN(n)/std::sqrt(fnmm*fnpm);              //   (-1)^n / sqrt( (n + m)! / (n - m)! )
         for( int n=0, nm=0; n!=P; ++n ) {                       //   Loop over n in Cjknm
           for( int m=-n; m<=n; ++m, ++nm, ++jknm ) {            //    Loop over m in Cjknm
             const int jnkm = (j+n)*(j+n)+j+n+m-k;               //     Index C_{j+n}^{m-k}
-            Cnm[jknm] = std::pow(I,real_t(abs(k-m)-abs(k)-abs(m)))//     Cjknm
-                      * real_t(ODDEVEN(j)*Anm[nm]*Anm[jk]/Anm[jnkm]) * EPS;
+            Cnm[jknm] = std::pow(I,real(abs(k-m)-abs(k)-abs(m)))//     Cjknm
+                      * real(ODDEVEN(j)*Anm[nm]*Anm[jk]/Anm[jnkm]) * EPS;
           }                                                     //    End loop over m in Cjknm
         }                                                       //   End loop over n in Cjknm
       }                                                         //  End loop over in k in Cjknm
   }
 
 //! Set paramters for Ewald summation
-  void setEwald(real_t ksize, real_t alpha, real_t sigma) {
+  void setEwald(real ksize, real alpha, real sigma) {
     KSIZE = ksize;                                              // Set number of waves
     ALPHA = alpha;                                              // Set scaling parameter
     SIGMA = sigma;                                              // Set scaling parameter

include/parallelfmm.h

             bool send = false;                                  //     Initialize logical for sending
             if( IMAGES == 0 ) {                                 //     If free boundary condition
               Xperiodic = 0;                                    //      Set periodic coordinate offset
-              real_t R = getDistance(C,xminAll[irank],xmaxAll[irank]);//  Get distance to other domain
+              real R = getDistance(C,xminAll[irank],xmaxAll[irank]);//  Get distance to other domain
               send |= CLET * C->R > THETA * R - EPS2;           //      If the cell seems close enough for P2P
             } else {                                            //     If periodic boundary condition
               for( int ix=-1; ix<=1; ++ix ) {                   //      Loop over x periodic direction
                     Xperiodic[0] = ix * 2 * R0;                 //         Coordinate offset for x periodic direction
                     Xperiodic[1] = iy * 2 * R0;                 //         Coordinate offset for y periodic direction
                     Xperiodic[2] = iz * 2 * R0;                 //         Coordinate offset for z periodic direction
-                    real_t R = getDistance(C,xminAll[irank],xmaxAll[irank]);// Get distance to other domain
+                    real R = getDistance(C,xminAll[irank],xmaxAll[irank]);// Get distance to other domain
                     send |= CLET * C->R > THETA * R - EPS2;     //         If the cell seems close enough for P2P
                   }                                             //        End loop over z periodic direction
                 }                                               //       End loop over y periodic direction
   }
 
 //! Get disatnce to other domain
-  real_t getDistance(C_iter C, vect xmin, vect xmax) {
+  real getDistance(C_iter C, vect xmin, vect xmax) {
     vect dist;                                                  // Distance vector
     for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
       dist[d] = (C->X[d] + Xperiodic[d] > xmax[d])*             //  Calculate the distance between cell C and
                 (C->X[d] + Xperiodic[d] < xmin[d])*             //  Take the differnece from xmin or xmax
                 (C->X[d] + Xperiodic[d] - xmin[d]);             //  or 0 if between xmin and xmax
     }                                                           // End loop over dimensions
-    real_t R = std::sqrt(norm(dist));                           // Scalar distance
+    real R = std::sqrt(norm(dist));                             // Scalar distance
     return R;
   }
 
       bool divide = false;                                      //  Initialize logical for dividing
       if( IMAGES == 0 ) {                                       //  If free boundary condition
         Xperiodic = 0;                                          //   Set periodic coordinate offset
-        real_t R = getDistance(CC,xmin,xmax);                   //   Get distance to other domain
+        real R = getDistance(CC,xmin,xmax);                     //   Get distance to other domain
         divide |= CLET * CC->R > THETA * R - EPS2;              //   If the cell seems too close and not twig
       } else {                                                  //  If periodic boundary condition
         for( int ix=-1; ix<=1; ++ix ) {                         //   Loop over x periodic direction
               Xperiodic[0] = ix * 2 * R0;                       //      Coordinate offset for x periodic direction
               Xperiodic[1] = iy * 2 * R0;                       //      Coordinate offset for y periodic direction
               Xperiodic[2] = iz * 2 * R0;                       //      Coordinate offset for z periodic direction
-              real_t R = getDistance(CC,xmin,xmax);             //      Get distance to other domain
+              real R = getDistance(CC,xmin,xmax);               //      Get distance to other domain
               divide |= CLET * CC->R > THETA * R - EPS2;        //      If the cell seems too close and not twig
             }                                                   //     End loop over z periodic direction
           }                                                     //    End loop over y periodic direction
 
 //! Check total charge
   void checkSumMass(Cells &cells) {
-    real_t localMass = 0;
+    real localMass = 0;
     for( C_iter C=cells.begin(); C!=cells.end(); ++C ) {
       if( C->NCHILD == 0 ) {
         localMass += std::abs(C->M[0]);
       }
     }
-    real_t globalMass;
+    real globalMass;
     MPI_Allreduce(&localMass,&globalMass,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
     print("localMass : ",0);
     print(localMass);

include/partition.h

 private:
 //! Split domain according to iSplit
   void splitDomain(bigint iSplit, int l, int d) {
-    real_t X = (iSplit + 1) * (XMAX[l][d] - XMIN[l][d]) / numCells1D + XMIN[l][d];// Coordinate corresponding to iSplit
+    real X = (iSplit + 1) * (XMAX[l][d] - XMIN[l][d]) / numCells1D + XMIN[l][d];// Coordinate corresponding to iSplit
     XMAX[l+1] = XMAX[l];                                        // Set XMAX for next subdivision
     XMIN[l+1] = XMIN[l];                                        // Set XMIN for next subdivision
     if( color[l+1][0] % 2 == 0 ) {                              // If on left side
   ~Partition() {}
 
 //! Set bounds of domain to be partitioned
-  void setGlobDomain(Bodies &bodies, vect x0=0, real_t r0=M_PI) {
+  void setGlobDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
     numCells1D = 1 << getMaxLevel(bodies);                      // Number of leaf cells in each dimensions
     B_iter B = bodies.begin();                                  // Reset body iterator
     MPI_Datatype MPI_TYPE = getType(XMIN[0][0]);                // Get MPI data type

include/topdown.h

     bigint CHILD[8];                                            //!< Iterator offset of child nodes
     B_iter LEAF[NCRIT];                                         //!< Iterator for leafs
     vect X;                                                     //!< Node center
-    real_t R;                                                   //!< Node radius
+    real R;                                                     //!< Node radius
   };
   std::vector<Node> nodes;                                      //!< Nodes in the tree
 
     bigint pOff = ((1 << 3* nodes[i].LEVEL   ) - 1) / 7;        // Parent cell index offset
     bigint cOff = ((1 << 3*(nodes[i].LEVEL+1)) - 1) / 7;        // Current cell index offset
     vect x = nodes[i].X;                                        // Initialize new center position with old center
-    real_t r = nodes[i].R/2;                                    // Initialize new size
+    real r = nodes[i].R/2;                                      // Initialize new size
     for( int d=0; d!=3; ++d ) {                                 // Loop over dimensions
       x[d] += r * (((octant & 1 << d) >> d) * 2 - 1);           //  Calculate new center position
     }                                                           // End loop over dimensions
 #include <iostream>
 #include <list>
 #include <map>
-#include <pthread.h>
 #include <queue>
 #include <stack>
 #include <string>
 #include <omp.h>
 #endif
 
-typedef unsigned             bigint;                            //!< Big integer type
-typedef float                real_t;                            //!< Real number type on CPU
-typedef float                gpureal;                           //!< Real number type on GPU
-typedef std::complex<real_t> complex_t;                         //!< Complex number type
-typedef vec<3,real_t>        vect;                              //!< 3-D vector type
+typedef unsigned           bigint;                              //!< Big integer type
+typedef float              real;                                //!< Real number type on CPU
+typedef float              gpureal;                             //!< Real number type on GPU
+typedef std::complex<real> complex;                             //!< Complex number type
+typedef vec<3,real>        vect;                                //!< 3-D vector type
 
 
 #ifndef KERNEL
 int MPISIZE    = 1;                                             //!< MPI comm size
 int DEVICE     = 0;                                             //!< GPU device ID
 int IMAGES     = 0;                                             //!< Number of periodic image sublevels
-real_t THETA   = .5;                                            //!< Multipole acceptance criteria
+real THETA     = .5;                                            //!< Multipole acceptance criteria
 vect Xperiodic = 0;                                             //!< Coordinate offset of periodic image
 #if PAPI
 int PAPIEVENT  = PAPI_NULL;                                     //!< PAPI event handle
 extern int MPISIZE;                                             //!< MPI comm size
 extern int DEVICE;                                              //!< GPU device ID
 extern int IMAGES;                                              //!< Number of periodic image sublevels
-extern real_t THETA;                                            //!< Multipole acceptance criteria
+extern real THETA;                                              //!< Multipole acceptance criteria
 extern vect Xperiodic;                                          //!< Coordinate offset of periodic image
 #if PAPI
 extern int PAPIEVENT;                                           //!< PAPI event handle
 const int  NCRIT    = 64;                                       //!< Number of bodies per cell
 const int  MAXBODY  = 50000;                                    //!< Maximum number of bodies per GPU kernel
 const int  MAXCELL  = 10000000;                                 //!< Maximum number of bodies/coefs in cell per GPU kernel
-const real_t CLET   = 2;                                        //!< LET opening critetia
-const real_t EPS    = 1e-6;                                     //!< Single/double precision epsilon
-const real_t EPS2   = 0;                                        //!< Softening parameter (squared)
-const real_t R2MIN  = 0.0001;                                   //!< Minimum value for L-J R^2
-const real_t R2MAX  = 100.0;                                    //!< Maximum value for L-J R^2
+const real CLET     = 2;                                        //!< LET opening critetia
+const real EPS      = 1e-6;                                     //!< Single/double precision epsilon
+const real EPS2     = 0;                                        //!< Softening parameter (squared)
+const real R2MIN    = 0.0001;                                   //!< Minimum value for L-J R^2
+const real R2MAX    = 100.0;                                    //!< Maximum value for L-J R^2
 const int  GPUS     = 3;                                        //!< Number of GPUs per node
 const int  THREADS  = 64;                                       //!< Number of threads per thread-block
 
 const int NTERM = P*(P+1)/2;                                    //!< Number of Spherical multipole/local terms
 
 #if Cartesian
-typedef vec<MTERM,real_t>                      Mset;            //!< Multipole coefficient type for Cartesian
-typedef vec<LTERM,real_t>                      Lset;            //!< Local coefficient type for Cartesian
+typedef vec<MTERM,real>                        Mset;            //!< Multipole coefficient type for Cartesian
+typedef vec<LTERM,real>                        Lset;            //!< Local coefficient type for Cartesian
 #elif Spherical
-typedef vec<NTERM,complex_t>                   Mset;            //!< Multipole coefficient type for spherical
-typedef vec<NTERM,complex_t>                   Lset;            //!< Local coefficient type for spherical
+typedef vec<NTERM,complex>                     Mset;            //!< Multipole coefficient type for spherical
+typedef vec<NTERM,complex>                     Lset;            //!< Local coefficient type for spherical
 #endif
 typedef std::vector<bigint>                    Bigints;         //!< Vector of big integer types
 
   int         IPROC;                                            //!< Initial process numbering for partitioning back
   bigint      ICELL;                                            //!< Cell index
   vect        X;                                                //!< Position
-  real_t      SRC;                                              //!< Scalar source values
+  real        SRC;                                              //!< Scalar source values
 };
 typedef std::vector<JBody>             JBodies;                 //!< Vector of source bodies
 typedef std::vector<JBody>::iterator   JB_iter;                 //!< Iterator for source body vector
 
 //! Structure of bodies
 struct Body : public JBody {
-  vec<4,real_t> TRG;                                            //!< Scalar+vector target values
+  vec<4,real> TRG;                                              //!< Scalar+vector target values
   bool operator<(const Body &rhs) const {                       //!< Overload operator for comparing body index
     return this->IBODY < rhs.IBODY;                             //!< Comparison function for body index
   }
   int      CHILD;                                               //!< Iterator offset of child cells
   B_iter   LEAF;                                                //!< Iterator of first leaf
   vect     X;                                                   //!< Cell center
-  real_t   R;                                                   //!< Cell radius
-  real_t   RMAX;                                                //!< Max cell radius
-  real_t   RCRIT;                                               //!< Critical cell radius
+  real     R;                                                   //!< Cell radius
+  real     RMAX;                                                //!< Max cell radius
+  real     RCRIT;                                               //!< Critical cell radius
   Mset     M;                                                   //!< Multipole coefficients
   Lset     L;                                                   //!< Local coefficients
 };
 
 //! Structure for Ewald summation
 struct Ewald {
-  vect   K;                                                     //!< 3-D wave number vector
-  real_t REAL;                                                  //!< real_t part of wave
-  real_t IMAG;                                                  //!< imaginary part of wave
+  vect K;                                                       //!< 3-D wave number vector
+  real REAL;                                                    //!< real part of wave
+  real IMAG;                                                    //!< imaginary part of wave
 };
 typedef std::vector<Ewald>             Ewalds;                  //!< Vector of Ewald summation types
 typedef std::vector<Ewald>::iterator   E_iter;                  //!< Iterator for Ewald summation types
   vtkPoints *points[maxGroups];
   vtkPoints *hexPoints;
 public:
-  void setDomain(const real_t r0, const vect x0) {
+  void setDomain(const real r0, const vect x0) {
     hexPoints = vtkPoints::New();
     hexPoints->SetNumberOfPoints(8);
     hexPoints->SetPoint(0, x0[0]-r0, x0[1]-r0, x0[2]-r0);

kernel/CPUCartesianLaplace.cxx

 
 template<int nx, int ny, int nz>
 struct Index {
-  static const int        I = Index<nx,ny+1,nz-1>::I + 1;
-  static constexpr real_t F = Index<nx,ny,nz-1>::F * nz;
+  static const int      I = Index<nx,ny+1,nz-1>::I + 1;
+  static constexpr real F = Index<nx,ny,nz-1>::F * nz;
 };
 
 template<int nx, int ny>
 struct Index<nx,ny,0> {
-  static const int        I = Index<nx+1,0,ny-1>::I + 1;
-  static constexpr real_t F = Index<nx,ny-1,0>::F * ny;
+  static const int      I = Index<nx+1,0,ny-1>::I + 1;
+  static constexpr real F = Index<nx,ny-1,0>::F * ny;
 };
 
 template<int nx>
 struct Index<nx,0,0> {
-  static const int        I = Index<0,0,nx-1>::I + 1;
-  static constexpr real_t F = Index<nx-1,0,0>::F * nx;
+  static const int      I = Index<0,0,nx-1>::I + 1;
+  static constexpr real F = Index<nx-1,0,0>::F * nx;
 };
 
 template<>
 struct Index<0,0,0> {
-  static const int        I = 0;
-  static constexpr real_t F = 1;
+  static const int      I = 0;
+  static constexpr real F = 1;
 };
 
 
 template<int n, int kx, int ky , int kz, int d>
 struct DerivativeTerm {
   static const int coef = 1 - 2 * n;
-  static inline real_t kernel(const Lset &C, const vect &dist) {
+  static inline real kernel(const Lset &C, const vect &dist) {
     return coef * dist[d] * C[Index<kx,ky,kz>::I];
   }
 };
 template<int n, int kx, int ky , int kz>
 struct DerivativeTerm<n,kx,ky,kz,-1> {
   static const int coef = 1 - n;
-  static inline real_t kernel(const Lset &C, const vect&) {
+  static inline real kernel(const Lset &C, const vect&) {
     return coef * C[Index<kx,ky,kz>::I];
   }
 };
   static const int nextflag = 5 - (kz < nz || kz == 1);
   static const int dim = kz == (nz-1) ? -1 : 2;
   static const int n = nx + ny + nz;
-  static inline real_t loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,kz-1,nextflag>::loop(C,dist)
          + DerivativeTerm<n,nx,ny,kz-1,dim>::kernel(C,dist);
   }
 template<int nx, int ny, int nz, int kx, int ky, int kz>
 struct DerivativeSum<nx,ny,nz,kx,ky,kz,4> {
   static const int nextflag = 3 - (ny == 0);
-  static inline real_t loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,nz,nextflag>::loop(C,dist);
   }
 };
   static const int nextflag = 3 - (ky < ny || ky == 1);
   static const int dim = ky == (ny-1) ? -1 : 1;
   static const int n = nx + ny + nz;
-  static inline real_t loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,nx,ky-1,nz,nextflag>::loop(C,dist)
          + DerivativeTerm<n,nx,ky-1,nz,dim>::kernel(C,dist);
   }
 template<int nx, int ny, int nz, int kx, int ky, int kz>
 struct DerivativeSum<nx,ny,nz,kx,ky,kz,2> {
   static const int nextflag = 1 - (nx == 0);
-  static inline real_t loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,nz,nextflag>::loop(C,dist);
   }
 };
   static const int nextflag = 1 - (kx < nx || kx == 1);
   static const int dim = kx == (nx-1) ? -1 : 0;
   static const int n = nx + ny + nz;
-  static inline real_t loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,kx-1,ny,nz,nextflag>::loop(C,dist)
          + DerivativeTerm<n,kx-1,ny,nz,dim>::kernel(C,dist);
   }
 
 template<int nx, int ny, int nz, int kx, int ky, int kz>
 struct DerivativeSum<nx,ny,nz,kx,ky,kz,0> {
-  static inline real_t loop(const Lset&, const vect&) {
+  static inline real loop(const Lset&, const vect&) {
     return 0;
   }
 };
 
 template<int nx, int ny, int nz, int kx, int ky>
 struct DerivativeSum<nx,ny,nz,kx,ky,0,5> {
-  static inline real_t loop(const Lset &C, const vect &dist) {
+  static inline real loop(const Lset &C, const vect &dist) {
     return DerivativeSum<nx,ny,nz,nx,ny,0,4>::loop(C,dist);
   }
 };
     Terms<nx,ny+1,nz-1>::power(C,dist);
     C[Index<nx,ny,nz>::I] = C[Index<nx,ny,nz-1>::I] * dist[2] / nz;
   }
-  static inline void derivative(Lset &C, const vect &dist, const real_t &invR2) {
+  static inline void derivative(Lset &C, const vect &dist, const real &invR2) {
     static const int n = nx + ny + nz;
     Terms<nx,ny+1,nz-1>::derivative(C,dist,invR2);
     C[Index<nx,ny,nz>::I] = DerivativeSum<nx,ny,nz>::loop(C,dist) / n * invR2;
     Terms<nx+1,0,ny-1>::power(C,dist);
     C[Index<nx,ny,0>::I] = C[Index<nx,ny-1,0>::I] * dist[1] / ny;
   }
-  static inline void derivative(Lset &C, const vect &dist, const real_t &invR2) {
+  static inline void derivative(Lset &C, const vect &dist, const real &invR2) {
     static const int n = nx + ny;
     Terms<nx+1,0,ny-1>::derivative(C,dist,invR2);
     C[Index<nx,ny,0>::I] = DerivativeSum<nx,ny,0>::loop(C,dist) / n * invR2;
     Terms<0,0,nx-1>::power(C,dist);
     C[Index<nx,0,0>::I] = C[Index<nx-1,0,0>::I] * dist[0] / nx;
   }
-  static inline void derivative(Lset &C, const vect &dist, const real_t &invR2) {
+  static inline void derivative(Lset &C, const vect &dist, const real &invR2) {
     static const int n = nx;
     Terms<0,0,nx-1>::derivative(C,dist,invR2);
     C[Index<nx,0,0>::I] = DerivativeSum<nx,0,0>::loop(C,dist) / n * invR2;
 template<>
 struct Terms<0,0,0> {
   static inline void power(Lset&, const vect&) {}
-  static inline void derivative(Lset&, const vect&, const real_t&) {}
+  static inline void derivative(Lset&, const vect&, const real&) {}
   static inline void scale(Lset&) {}
 };
 
 
 template<int nx, int ny, int nz, int kx=nx, int ky=ny, int kz=nz>
 struct M2MSum {
-  static inline real_t kernel(const Lset &C, const Mset &M) {
+  static inline real kernel(const Lset &C, const Mset &M) {
     return M2MSum<nx,ny,nz,kx,ky,kz-1>::kernel(C,M)
          + C[Index<nx-kx,ny-ky,nz-kz>::I]*M[Index<kx,ky,kz>::I];
   }
 
 template<int nx, int ny, int nz, int kx, int ky>
 struct M2MSum<nx,ny,nz,kx,ky,0> {
-  static inline real_t kernel(const Lset &C, const Mset &M) {
+  static inline real kernel(const Lset &C, const Mset &M) {
     return M2MSum<nx,ny,nz,kx,ky-1,nz>::kernel(C,M)
          + C[Index<nx-kx,ny-ky,nz>::I]*M[Index<kx,ky,0>::I];
   }
 
 template<int nx, int ny, int nz, int kx>
 struct M2MSum<nx,ny,nz,kx,0,0> {
-  static inline real_t kernel(const Lset &C, const Mset &M) {
+  static inline real kernel(const Lset &C, const Mset &M) {
     return M2MSum<nx,ny,nz,kx-1,ny,nz>::kernel(C,M)
          + C[Index<nx-kx,ny,nz>::I]*M[Index<kx,0,0>::I];
   }
 
 template<int nx, int ny, int nz>
 struct M2MSum<nx,ny,nz,0,0,0> {
-  static inline real_t kernel(const Lset&, const Mset&) { return 0; }
+  static inline real kernel(const Lset&, const Mset&) { return 0; }
 };
 
 
 template<int nx, int ny, int nz, int kx=0, int ky=0, int kz=P-nx-ny-nz>
 struct M2LSum {
-  static inline real_t kernel(const Lset &L, const Mset &M) {
+  static inline real kernel(const Lset &L, const Mset &M) {
     return M2LSum<nx,ny,nz,kx,ky+1,kz-1>::kernel(L,M)
          + M[Index<kx,ky,kz>::I] * L[Index<nx+kx,ny+ky,nz+kz>::I];
   }
 
 template<int nx, int ny, int nz, int kx, int ky>
 struct M2LSum<nx,ny,nz,kx,ky,0> {
-  static inline real_t kernel(const Lset &L, const Mset &M) {
+  static inline real kernel(const Lset &L, const Mset &M) {
     return M2LSum<nx,ny,nz,kx+1,0,ky-1>::kernel(L,M)
          + M[Index<kx,ky,0>::I] * L[Index<nx+kx,ny+ky,nz>::I];
   }
 
 template<int nx, int ny, int nz, int kx>
 struct M2LSum<nx,ny,nz,kx,0,0> {
-  static inline real_t kernel(const Lset &L, const Mset &M) {
+  static inline real kernel(const Lset &L, const Mset &M) {
     return M2LSum<nx,ny,nz,0,0,kx-1>::kernel(L,M)
          + M[Index<kx,0,0>::I] * L[Index<nx+kx,ny,nz>::I];
   }
 
 template<int nx, int ny, int nz>
 struct M2LSum<nx,ny,nz,0,0,0> {
-  static inline real_t kernel(const Lset&, const Mset&) { return 0; }
+  static inline real kernel(const Lset&, const Mset&) { return 0; }
 };
 
 
 template<int nx, int ny, int nz, int kx=0, int ky=0, int kz=P-nx-ny-nz>
 struct LocalSum {
-  static inline real_t kernel(const Lset &C, const Lset &L) {
+  static inline real kernel(const Lset &C, const Lset &L) {
     return LocalSum<nx,ny,nz,kx,ky+1,kz-1>::kernel(C,L)
          + C[Index<kx,ky,kz>::I] * L[Index<nx+kx,ny+ky,nz+kz>::I];
   }
 
 template<int nx, int ny, int nz, int kx, int ky>
 struct LocalSum<nx,ny,nz,kx,ky,0> {
-  static inline real_t kernel(const Lset &C, const Lset &L) {
+  static inline real kernel(const Lset &C, const Lset &L) {
     return LocalSum<nx,ny,nz,kx+1,0,ky-1>::kernel(C,L)
          + C[Index<kx,ky,0>::I] * L[Index<nx+kx,ny+ky,nz>::I];
   }
 
 template<int nx, int ny, int nz, int kx>
 struct LocalSum<nx,ny,nz,kx,0,0> {
-  static inline real_t kernel(const Lset &C, const Lset &L) {
+  static inline real kernel(const Lset &C, const Lset &L) {
     return LocalSum<nx,ny,nz,0,0,kx-1>::kernel(C,L)
          + C[Index<kx,0,0>::I] * L[Index<nx+kx,ny,nz>::I];
   }
 
 template<int nx, int ny, int nz>
 struct LocalSum<nx,ny,nz,0,0,0> {
-  static inline real_t kernel(const Lset&, const Lset&) { return 0; }
+  static inline real kernel(const Lset&, const Lset&) { return 0; }
 };
 
 
 };
 
 template<int PP>
-inline void getCoef(Lset &C, const vect &dist, real_t &invR2, const real_t &invR) {
+inline void getCoef(Lset &C, const vect &dist, real &invR2, const real &invR) {
   C[0] = invR;
   Terms<0,0,PP>::derivative(C,dist,invR2);
   Terms<0,0,PP>::scale(C);
 }
 
 template<>
-inline void getCoef<1>(Lset &C, const vect &dist, real_t &invR2, const real_t &invR) {
+inline void getCoef<1>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   C[0] = invR;
   invR2 = -invR2;
-  real_t x = dist[0], y = dist[1], z = dist[2];
-  real_t invR3 = invR * invR2;
+  real x = dist[0], y = dist[1], z = dist[2];
+  real invR3 = invR * invR2;
   C[1] = x * invR3;
   C[2] = y * invR3;
   C[3] = z * invR3;
 }
 
 template<>
-inline void getCoef<2>(Lset &C, const vect &dist, real_t &invR2, const real_t &invR) {
+inline void getCoef<2>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   getCoef<1>(C,dist,invR2,invR);
-  real_t x = dist[0], y = dist[1], z = dist[2];
-  real_t invR3 = invR * invR2;
-  real_t invR5 = 3 * invR3 * invR2;
-  real_t t = x * invR5;
+  real x = dist[0], y = dist[1], z = dist[2];
+  real invR3 = invR * invR2;
+  real invR5 = 3 * invR3 * invR2;
+  real t = x * invR5;
   C[4] = x * t + invR3;
   C[5] = y * t;
   C[6] = z * t;
 }
 
 template<>
-inline void getCoef<3>(Lset &C, const vect &dist, real_t &invR2, const real_t &invR) {
+inline void getCoef<3>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   getCoef<2>(C,dist,invR2,invR);
-  real_t x = dist[0], y = dist[1], z = dist[2];
-  real_t invR3 = invR * invR2;
-  real_t invR5 = 3 * invR3 * invR2;
-  real_t invR7 = 5 * invR5 * invR2;
-  real_t t = x * x * invR7;
+  real x = dist[0], y = dist[1], z = dist[2];
+  real invR3 = invR * invR2;
+  real invR5 = 3 * invR3 * invR2;
+  real invR7 = 5 * invR5 * invR2;
+  real t = x * x * invR7;
   C[10] = x * (t + 3 * invR5);
   C[11] = y * (t +     invR5);
   C[12] = z * (t +     invR5);
 }
 
 template<>
-inline void getCoef<4>(Lset &C, const vect &dist, real_t &invR2, const real_t &invR) {
+inline void getCoef<4>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   getCoef<3>(C,dist,invR2,invR);
-  real_t x = dist[0], y = dist[1], z = dist[2];
-  real_t invR3 = invR * invR2;
-  real_t invR5 = 3 * invR3 * invR2;
-  real_t invR7 = 5 * invR5 * invR2;
-  real_t invR9 = 7 * invR7 * invR2;
-  real_t t = x * x * invR9;
+  real x = dist[0], y = dist[1], z = dist[2];
+  real invR3 = invR * invR2;
+  real invR5 = 3 * invR3 * invR2;
+  real invR7 = 5 * invR5 * invR2;
+  real invR9 = 7 * invR7 * invR2;
+  real t = x * x * invR9;
   C[20] = x * x * (t + 6 * invR7) + 3 * invR5;
   C[21] = x * y * (t + 3 * invR7);
   C[22] = x * z * (t + 3 * invR7);
 }
 
 template<>
-inline void getCoef<5>(Lset &C, const vect &dist, real_t &invR2, const real_t &invR) {
+inline void getCoef<5>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   getCoef<4>(C,dist,invR2,invR);
-  real_t x = dist[0], y = dist[1], z = dist[2];
-  real_t invR3 = invR * invR2;
-  real_t invR5 = 3 * invR3 * invR2;
-  real_t invR7 = 5 * invR5 * invR2;
-  real_t invR9 = 7 * invR7 * invR2;
-  real_t invR11 = 9 * invR9 * invR2;
-  real_t t = x * x * invR11;
+  real x = dist[0], y = dist[1], z = dist[2];
+  real invR3 = invR * invR2;
+  real invR5 = 3 * invR3 * invR2;
+  real invR7 = 5 * invR5 * invR2;
+  real invR9 = 7 * invR7 * invR2;
+  real invR11 = 9 * invR9 * invR2;
+  real t = x * x * invR11;
   C[35] = x * x * x * (t + 10 * invR9) + 15 * x * invR7;
   C[36] = x * x * y * (t +  6 * invR9) +  3 * y * invR7;
   C[37] = x * x * z * (t +  6 * invR9) +  3 * z * invR7;
 }
 
 template<>
-inline void getCoef<6>(Lset &C, const vect &dist, real_t &invR2, const real_t &invR) {
+inline void getCoef<6>(Lset &C, const vect &dist, real &invR2, const real &invR) {
   getCoef<5>(C,dist,invR2,invR);
-  real_t x = dist[0], y = dist[1], z = dist[2];
-  real_t invR3 = invR * invR2;
-  real_t invR5 = 3 * invR3 * invR2;
-  real_t invR7 = 5 * invR5 * invR2;
-  real_t invR9 = 7 * invR7 * invR2;
-  real_t invR11 = 9 * invR9 * invR2;
-  real_t invR13 = 11 * invR11 * invR2;
-  real_t t = x * x * invR13;
+  real x = dist[0], y = dist[1], z = dist[2];
+  real invR3 = invR * invR2;
+  real invR5 = 3 * invR3 * invR2;
+  real invR7 = 5 * invR5 * invR2;
+  real invR9 = 7 * invR7 * invR2;
+  real invR11 = 9 * invR9 * invR2;
+  real invR13 = 11 * invR11 * invR2;
+  real t = x * x * invR13;
   C[56] = x * x * x * x * (t + 15 * invR11) + 45 * x * x * invR9 + 15 * invR7;
   C[57] = x * x * x * y * (t + 10 * invR11) + 15 * x * y * invR9;
   C[58] = x * x * x * z * (t + 10 * invR11) + 15 * x * z * invR9;
 
 template<>
 void Kernel<Laplace>::P2M(C_iter Ci) {
-  real_t Rmax = 0;
+  real Rmax = 0;
 //  setCenter(Ci);
   for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {
     vect dist = Ci->X - B->X;
-    real_t R = std::sqrt(norm(dist));
+    real R = std::sqrt(norm(dist));
     if( R > Rmax ) Rmax = R;
     Lset M;
     M[0] = B->SRC;
 
 template<>
 void Kernel<Laplace>::M2M(C_iter Ci) {
-  real_t Rmax = Ci->RMAX;
+  real Rmax = Ci->RMAX;
 //  setCenter(Ci);
   for( C_iter Cj=Cj0+Ci->CHILD; Cj!=Cj0+Ci->CHILD+Ci->NCHILD; ++Cj ) {
     vect dist = Ci->X - Cj->X;
-    real_t R = std::sqrt(norm(dist)) + Cj->RCRIT;
+    real R = std::sqrt(norm(dist)) + Cj->RCRIT;
     if( R > Rmax ) Rmax = R;
     Mset M;
     Lset C;
 template<>
 void Kernel<Laplace>::M2L(C_iter Ci, C_iter Cj) const {
   vect dist = Ci->X - Cj->X - Xperiodic;
-  real_t invR2 = 1 / norm(dist);
-  real_t invR  = Cj->M[0] * std::sqrt(invR2);
+  real invR2 = 1 / norm(dist);
+  real invR  = Cj->M[0] * std::sqrt(invR2);
   Lset C;
   getCoef<P>(C,dist,invR2,invR);
   sumM2L<P>(Ci->L,C,Cj->M);
 void Kernel<Laplace>::M2P(C_iter Ci, C_iter Cj) const {
   for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {
     vect dist = B->X - Cj->X - Xperiodic;
-    real_t invR2 = 1 / norm(dist);
-    real_t invR  = Cj->M[0] * std::sqrt(invR2);
+    real invR2 = 1 / norm(dist);
+    real invR  = Cj->M[0] * std::sqrt(invR2);
     Lset C;
     getCoef<P>(C,dist,invR2,invR);
     sumM2P(B,C,Cj->M);

kernel/CPUEwaldLaplace.cxx

 #undef KERNEL
 
 namespace {
-void dft(Ewalds &ewalds, Bodies &bodies, real_t R0) {
-  real_t scale = M_PI / R0;
+void dft(Ewalds &ewalds, Bodies &bodies, real R0) {
+  real scale = M_PI / R0;
 #pragma omp parallel for
   for( int i=0; i<int(ewalds.size()); ++i ) {
     E_iter E = ewalds.begin() + i;
     E->REAL = E->IMAG = 0;
     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {
-      real_t th = 0;
+      real th = 0;
       for( int d=0; d<3; d++ ) th += E->K[d] * B->X[d] * scale;
       E->REAL += B->SRC * cos(th);
       E->IMAG += B->SRC * sin(th);
   }
 }
 
-void idft(Ewalds &ewalds, Bodies &bodies, real_t R0) {
-  real_t scale = M_PI / R0;
+void idft(Ewalds &ewalds, Bodies &bodies, real R0) {
+  real scale = M_PI / R0;
 #pragma omp parallel for
   for( int i=0; i<int(bodies.size()); ++i ) {
     B_iter B = bodies.begin() + i;
-    vec<4,real_t> TRG = 0;
+    vec<4,real> TRG = 0;
     for( E_iter E=ewalds.begin(); E!=ewalds.end(); ++E ) {
-      real_t th = 0;
+      real th = 0;
       for( int d=0; d<3; d++ ) th += E->K[d] * B->X[d] * scale;
-      real_t dtmp = E->REAL * sin(th) - E->IMAG * cos(th);
+      real dtmp = E->REAL * sin(th) - E->IMAG * cos(th);
       TRG[0]   += E->REAL * cos(th) + E->IMAG * sin(th);
       for( int d=0; d<3; d++ ) TRG[d+1] -= dtmp * E->K[d];
     }
 }
 
 template<>
-void Kernel<Laplace>::EwaldReal(C_iter Ci, C_iter Cj) const {   // Ewald real_t part on CPU
+void Kernel<Laplace>::EwaldReal(C_iter Ci, C_iter Cj) const {   // Ewald real part on CPU
   for( int i=0; i<Ci->NDLEAF; ++i ) {                           // Loop over target bodies
     B_iter Bi = Ci->LEAF + i;                                   //  Target body iterator
     for( B_iter Bj=Cj->LEAF; Bj!=Cj->LEAF+Cj->NDLEAF; ++Bj ) {  //  Loop over source bodies
         if( dist[d] < -R0 ) dist[d] += 2 * R0;                  //    Wrap domain so that target is always at
         if( dist[d] >= R0 ) dist[d] -= 2 * R0;                  //    the center of a [-R0,R0]^3 source cube
       }                                                         //   End loop over dimensions
-      real_t R2 = norm(dist);                                   //   R^2
+      real R2 = norm(dist);                                     //   R^2
       if( R2 != 0 ) {                                           //   Exclude self interaction
-        real_t R2s = R2 * ALPHA * ALPHA;                        //    (R * alpha)^2
-        real_t Rs = std::sqrt(R2s);                             //    R * alpha
-        real_t invRs = 1 / Rs;                                  //    1 / (R * alpha)
-        real_t invR2s = invRs * invRs;                          //    1 / (R * alpha)^2
-        real_t invR3s = invR2s * invRs;                         //    1 / (R * alpha)^3
-        real_t dtmp = Bj->SRC * (M_2_SQRTPI * exp(-R2s) * invR2s + erfc(Rs) * invR3s);
+        real R2s = R2 * ALPHA * ALPHA;                          //    (R * alpha)^2
+        real Rs = std::sqrt(R2s);                               //    R * alpha
+        real invRs = 1 / Rs;                                    //    1 / (R * alpha)
+        real invR2s = invRs * invRs;                            //    1 / (R * alpha)^2
+        real invR3s = invR2s * invRs;                           //    1 / (R * alpha)^3
+        real dtmp = Bj->SRC * (M_2_SQRTPI * exp(-R2s) * invR2s + erfc(Rs) * invR3s);
         dtmp *= ALPHA * ALPHA * ALPHA;                          //    Scale temporary value
-        Bi->TRG[0] += Bj->SRC * erfc(Rs) * invRs * ALPHA;       //    Ewald real_t potential
-        Bi->TRG[1] -= dist[0] * dtmp;                           //    x component of Ewald real_t force
-        Bi->TRG[2] -= dist[1] * dtmp;                           //    y component of Ewald real_t force
-        Bi->TRG[3] -= dist[2] * dtmp;                           //    z component of Ewald real_t force
+        Bi->TRG[0] += Bj->SRC * erfc(Rs) * invRs * ALPHA;       //    Ewald real potential
+        Bi->TRG[1] -= dist[0] * dtmp;                           //    x component of Ewald real force
+        Bi->TRG[2] -= dist[1] * dtmp;                           //    y component of Ewald real force
+        Bi->TRG[3] -= dist[2] * dtmp;                           //    z component of Ewald real force
       }                                                         //   End if for self interaction
     }                                                           //  End loop over source bodies
   }                                                             // End loop over target bodies
 
 template<>
 void Kernel<Laplace>::EwaldWave(Bodies &bodies) const {         // Ewald wave part on CPU
-  real_t scale = M_PI / R0;
-  real_t coef = .25 / M_PI / M_PI / SIGMA / R0;
-  real_t coef2 = scale * scale / (4 * ALPHA * ALPHA);
+  real scale = M_PI / R0;
+  real coef = .25 / M_PI / M_PI / SIGMA / R0;
+  real coef2 = scale * scale / (4 * ALPHA * ALPHA);
 
   Ewalds ewalds;
-  real_t kmaxsq = KSIZE * KSIZE;
+  real kmaxsq = KSIZE * KSIZE;
   int kmax = int(KSIZE);
   for( int l=0; l<=kmax; l++ ) {
     int mmin = -kmax;
       int nmin = -kmax;
       if( l==0 && m==0 ) nmin=1;
       for( int n=nmin; n<=kmax; n++ ) {
-        real_t ksq = l * l + m * m + n * n;
+        real ksq = l * l + m * m + n * n;
         if( ksq <= kmaxsq ) {
           Ewald ewald;
           ewald.K[0] = l;
 
   dft(ewalds,bodies,R0);
   for( E_iter E=ewalds.begin(); E!=ewalds.end(); ++E ) {
-    real_t R2 = norm(E->K);
-    real_t factor = coef * exp(-R2 * coef2) / R2;
+    real R2 = norm(E->K);
+    real factor = coef * exp(-R2 * coef2) / R2;
     E->REAL *= factor;
     E->IMAG *= factor;
   }

kernel/CPUP2P.cxx

 #pragma omp parallel for
   for( int i=0; i<Ci->NDLEAF; ++i ) {                           // Loop over target bodies
     B_iter Bi = Ci->LEAF+i;                                     //  Target body iterator
-    real_t P0 = 0;                                              //  Initialize potential
+    real P0 = 0;                                                //  Initialize potential
     vect F0 = 0;                                                //  Initialize force
     for( B_iter Bj=Cj->LEAF; Bj!=Cj->LEAF+Cj->NDLEAF; ++Bj ) {  //  Loop over source bodies
       vect dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
-      real_t R2 = norm(dist) + EPS2;                            //   R^2
-      real_t invR2 = 1.0 / R2;                                  //   1 / R^2
+      real R2 = norm(dist) + EPS2;                              //   R^2
+      real invR2 = 1.0 / R2;                                    //   1 / R^2
       if( R2 == 0 ) invR2 = 0;                                  //   Exclude self interaction
-      real_t invR = Bj->SRC * std::sqrt(invR2);                 //   potential
+      real invR = Bj->SRC * std::sqrt(invR2);                   //   potential
       dist *= invR2 * invR;                                     //   force
       P0 += invR;                                               //   accumulate potential
       F0 += dist;                                               //   accumulate force
     Bi->TRG[3] -= F0[2];                                        //  z component of force
   }                                                             // End loop over target bodies
 #else
-  real_t (* cbi)[10] =  (real_t (*)[10])(&(Ci->LEAF->IBODY));
-  real_t (* cbj)[10] =  (real_t (*)[10])(&(Cj->LEAF->IBODY));
-  real_t xp[3] = {Xperiodic[0],Xperiodic[1],Xperiodic[2]};
+  real (* cbi)[10] =  (real (*)[10])(&(Ci->LEAF->IBODY));
+  real (* cbj)[10] =  (real (*)[10])(&(Cj->LEAF->IBODY));
+  real xp[3] = {Xperiodic[0],Xperiodic[1],Xperiodic[2]};
   int ni = Ci->NDLEAF;
   int nj = Cj->NDLEAF;
   int i,j;
 #pragma loop norecurrence
   for(i=0;i<ni;i++){
     for(j=0;j<nj;j++){
-      real_t dist_x = cbi[i][2] - cbj[j][2] - xp[0];
-      real_t dist_y = cbi[i][3] - cbj[j][3] - xp[1];
-      real_t dist_z = cbi[i][4] - cbj[j][4] - xp[2];
-      real_t R2 = EPS2+dist_x*dist_x+dist_y*dist_y+dist_z*dist_z;
-      real_t invR = 1.0/sqrt(R2);
+      real dist_x = cbi[i][2] - cbj[j][2] - xp[0];
+      real dist_y = cbi[i][3] - cbj[j][3] - xp[1];
+      real dist_z = cbi[i][4] - cbj[j][4] - xp[2];
+      real R2 = EPS2+dist_x*dist_x+dist_y*dist_y+dist_z*dist_z;
+      real invR = 1.0/sqrt(R2);
       if( R2 == 0 ) invR = 0;
-      real_t invR3 = cbj[j][5] * invR * invR * invR;
+      real invR3 = cbj[j][5] * invR * invR * invR;
       cbi[i][6] += cbj[j][5] * invR;
       cbi[i][7] -= dist_x * invR3;
       cbi[i][8] -= dist_y * invR3;
     for( B_iter Bj=Cj->LEAF; Bj!=Cj->LEAF+Cj->NDLEAF; ++Bj ) {  //  Loop over source bodies
       int atypej = int(Bj->SRC);                                //   Atom type of source
       vect dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
-      real_t R2 = norm(dist);                                   //   R squared
+      real R2 = norm(dist);                                     //   R squared
       if( R2 != 0 ) {                                           //   Exclude self interaction
-        real_t rs = RSCALE[atypei*ATOMS+atypej];                //    r scale
-        real_t gs = GSCALE[atypei*ATOMS+atypej];                //    g scale
-        real_t R2s = R2 * rs;                                   //    R^2 * r scale
+        real rs = RSCALE[atypei*ATOMS+atypej];                  //    r scale
+        real gs = GSCALE[atypei*ATOMS+atypej];                  //    g scale
+        real R2s = R2 * rs;                                     //    R^2 * r scale
         if( R2MIN <= R2 && R2 < R2MAX ) {                       //    Exclude outlier values
-          real_t invR2 = 1.0 / R2s;                             //     1 / R^2
-          real_t invR6 = invR2 * invR2 * invR2;                 //     1 / R^6
-          real_t dtmp = gs * invR6 * invR2 * (2.0 * invR6 - 1.0);//     g scale / R^2 * (2 / R^12 + 1 / R^6)
+          real invR2 = 1.0 / R2s;                               //     1 / R^2
+          real invR6 = invR2 * invR2 * invR2;                   //     1 / R^6
+          real dtmp = gs * invR6 * invR2 * (2.0 * invR6 - 1.0); //     g scale / R^2 * (2 / R^12 + 1 / R^6)
           Bi->TRG[0] += gs * invR6 * (invR6 - 1.0);             //     Van der Waals potential
           Bi->TRG[1] -= dist[0] * dtmp;                         //     x component of Van der Waals force
           Bi->TRG[2] -= dist[1] * dtmp;                         //     y component of Van der Waals force

kernel/CPUSphericalLaplace.cxx

 template<>
 void Kernel<Laplace>::P2M(C_iter Cj) {
   real Rmax = 0;
-  complex_t Ynm[P*P], YnmTheta[P*P];
+  complex Ynm[P*P], YnmTheta[P*P];
   for( B_iter B=Cj->LEAF; B!=Cj->LEAF+Cj->NCLEAF; ++B ) {
     vect dist = B->X - Cj->X;
     real R = std::sqrt(norm(dist));
 
 template<>
 void Kernel<Laplace>::M2M(C_iter Ci) {
-  const complex_t I(0.,1.);
-  complex_t Ynm[P*P], YnmTheta[P*P];
+  const complex I(0.,1.);
+  complex Ynm[P*P], YnmTheta[P*P];
   real Rmax = Ci->RMAX;
   for( C_iter Cj=Cj0+Ci->CHILD; Cj!=Cj0+Ci->CHILD+Ci->NCHILD; ++Cj ) {
     vect dist = Ci->X - Cj->X;
       for( int k=0; k<=j; ++k ) {
         int jk = j * j + j + k;
         int jks = j * (j + 1) / 2 + k;
-        complex_t M = 0;
+        complex M = 0;
         for( int n=0; n<=j; ++n ) {
           for( int m=-n; m<=std::min(k-1,n); ++m ) {
             if( j-n >= k-m ) {
 
 template<>
 void Kernel<Laplace>::M2L(C_iter Ci, C_iter Cj) const {
-  complex_t Ynm[P*P], YnmTheta[P*P];
+  complex Ynm[P*P], YnmTheta[P*P];
   vect dist = Ci->X - Cj->X - Xperiodic;
   real rho, alpha, beta;
   cart2sph(rho,alpha,beta,dist);
     for( int k=0; k<=j; ++k ) {
       int jk = j * j + j + k;
       int jks = j * (j + 1) / 2 + k;
-      complex_t L = 0;
+      complex L = 0;
       for( int n=0; n!=P-j; ++n ) {
         for( int m=-n; m<0; ++m ) {
           int nm   = n * n + n + m;
 
 template<>
 void Kernel<Laplace>::M2P(C_iter Ci, C_iter Cj) const {
-  const complex_t I(0.,1.);                                       // Imaginary unit
-  complex_t Ynm[P*P], YnmTheta[P*P];
+  const complex I(0.,1.);                                       // Imaginary unit
+  complex Ynm[P*P], YnmTheta[P*P];
   for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {
     vect dist = B->X - Cj->X - Xperiodic;
     vect spherical = 0;
 
 template<>
 void Kernel<Laplace>::L2L(C_iter Ci) const {
-  const complex_t I(0.,1.);
-  complex_t Ynm[P*P], YnmTheta[P*P];
+  const complex I(0.,1.);
+  complex Ynm[P*P], YnmTheta[P*P];
   C_iter Cj = Ci0 + Ci->PARENT;
   vect dist = Ci->X - Cj->X;
   real rho, alpha, beta;
     for( int k=0; k<=j; ++k ) {
       int jk = j * j + j + k;
       int jks = j * (j + 1) / 2 + k;
-      complex_t L = 0;
+      complex L = 0;
       for( int n=j; n!=P; ++n ) {
         for( int m=j+k-n; m<0; ++m ) {
           int jnkm = (n - j) * (n - j) + n - j + m - k;
 
 template<>
 void Kernel<Laplace>::L2P(C_iter Ci) const {
-  const complex_t I(0.,1.);                                       // Imaginary unit
-  complex_t Ynm[P*P], YnmTheta[P*P];
+  const complex I(0.,1.);                                       // Imaginary unit
+  complex Ynm[P*P], YnmTheta[P*P];
   for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NCLEAF; ++B ) {
     vect dist = B->X - Ci->X;
     vect spherical = 0;

kernel/GPUEvaluator.cxx

       int begin = targetBegin[Ci];                              //   Offset of target coefs
       if( isM ) {                                               //   If target is M
         for( int i=0; i!=NTERM; ++i ) {                         //    Loop over coefs in target cell
-          Ci->M[i] += (targetHost[begin+1*i+0],targetHost[begin+2*i+1]);// Copy target values from GPU buffer
+          Ci->M[i].real() += targetHost[begin+2*i+0];           //     Copy real target values from GPU buffer
+          Ci->M[i].imag() += targetHost[begin+2*i+1];           //     Copy imaginary target values from GPU buffer
         }                                                       //    End loop over coefs
       } else {                                                  //   If target is L
         for( int i=0; i!=NTERM; ++i ) {                         //    Loop over coefs in target cell
-          Ci->L[i] += (targetHost[begin+2*i+0],targetHost[begin+2*i+1]);// Copy target values from GPU buffer
+          Ci->L[i].real() += targetHost[begin+2*i+0];           //     Copy real target values from GPU buffer
+          Ci->L[i].imag() += targetHost[begin+2*i+1];           //     Copy imaginary target values from GPU buffer
         }                                                       //    End loop over coefs
       }                                                         //   Endif for target type
       lists[Ci-Ci0].clear();                                    //   Clear interaction list
 template<Equation equation>
 void Evaluator<equation>::evalP2P(Bodies &ibodies, Bodies &jbodies, bool onCPU) {// Evaluate all P2P kernels
   int prange = getPeriodicRange();                              // Get range of periodic images
-  int numImages = std::pow(real_t(2 * prange + 1),3);           // Get number of periodic images
+  int numImages = pow(2 * prange + 1,3);                        // Get number of periodic images
   int numIcall = int(ibodies.size()-1)/MAXBODY+1;               // Number of icall loops
   int numJcall = int(jbodies.size()-1)/(MAXBODY/numImages)+1;   // Number of jcall loops
   int ioffset = 0;                                              // Initialzie offset for icall loops

kernel/GPUEwaldLaplace.cu

   stopTimer("EwaldReal GPU");\
 }
 
-__global__ void dft(gpureal *ewaldsGlob, gpureal *bodiesGlob, const int numEwalds, const int numBodies, const real_t R0) {
+__global__ void dft(gpureal *ewaldsGlob, gpureal *bodiesGlob, const int numEwalds, const int numBodies, const real R0) {
   int i = blockIdx.x * THREADS + threadIdx.x;
   gpureal scale = M_PI / R0;
   gpureal REAL = 0, IMAG = 0;
   ewaldsGlob[5*i+4] = IMAG;
 }
 
-__global__ void idft(gpureal *ewaldsGlob, gpureal *bodiesGlob, const int numEwalds, const int numBodies, const real_t R0) {
+__global__ void idft(gpureal *ewaldsGlob, gpureal *bodiesGlob, const int numEwalds, const int numBodies, const real R0) {
   int i = blockIdx.x * THREADS + threadIdx.x;
   gpureal scale = M_PI / R0;
   gpureal TRG[4] = {0,0,0,0};
 
 __global__ void factor(gpureal *ewaldsGlob, const gpureal coef, const gpureal coef2) {
   int i = blockIdx.x * THREADS + threadIdx.x;
-  real_t R2 = ewaldsGlob[5*i+0] * ewaldsGlob[5*i+0]
-            + ewaldsGlob[5*i+1] * ewaldsGlob[5*i+1]
-            + ewaldsGlob[5*i+2] * ewaldsGlob[5*i+2];
-  real_t factor = coef * expf(-R2 * coef2) / R2;
-  ewaldsGlob[4*i+3] *= factor;
+  real R2 = ewaldsGlob[5*i+0] * ewaldsGlob[5*i+0]
+          + ewaldsGlob[5*i+1] * ewaldsGlob[5*i+1]
+          + ewaldsGlob[5*i+2] * ewaldsGlob[5*i+2];
+  real factor = coef * expf(-R2 * coef2) / R2;
+  ewaldsGlob[5*i+3] *= factor;
   ewaldsGlob[5*i+4] *= factor;
 }
 
 template<>
 void Kernel<Laplace>::EwaldWave(Bodies &bodies) const {     // Ewald wave part on CPU
-  real_t scale = M_PI / R0;
-  real_t coef = .25 / M_PI / M_PI / SIGMA / R0;
-  real_t coef2 = scale * scale / (4 * ALPHA * ALPHA);
+  real scale = M_PI / R0;
+  real coef = .25 / M_PI / M_PI / SIGMA / R0;
+  real coef2 = scale * scale / (4 * ALPHA * ALPHA);
 
   Ewalds ewalds;
-  real_t kmaxsq = KSIZE * KSIZE;
+  real kmaxsq = KSIZE * KSIZE;
   int kmax = KSIZE;
   for( int l=0; l<=kmax; l++ ) {
     int mmin = -kmax;
       int nmin = -kmax;
       if( l==0 && m==0 ) nmin=1;
       for( int n=nmin; n<=kmax; n++ ) {
-        real_t ksq = l * l + m * m + n * n;
+        real ksq = l * l + m * m + n * n;
         if( ksq <= kmaxsq ) {
           Ewald ewald;
           ewald.K[0] = l;

unit_test/Nparallel.cxx

     if(printNow) FMM.writeTime();                               //  Write timings of all events to file
     if(printNow) FMM.resetTimer();                              //  Erase all events in timer
 
-    real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
+    real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
     FMM.evalError(bodies,bodies2,diff1,norm1,diff2,norm2);      //  Evaluate error on the reduced set of bodies
     MPI_Datatype MPI_TYPE = FMM.getType(diff1);                 //  Get MPI datatype
     MPI_Reduce(&diff1,&diff3,1,MPI_TYPE,MPI_SUM,0,MPI_COMM_WORLD);// Reduce difference in potential

unit_test/check_gpus.cxx

   FMM.evalP2P(bodies2,bodies2,onCPU);                           // Direct summation on CPU
   FMM.stopTimer("Direct CPU",FMM.printNow);                     // Stop timer
 
-  real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;              // Initialize accumulators
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;              // Initialize accumulators
   FMM.evalError(bodies,bodies2,diff1,norm1,diff2,norm2);        // Evaluate error
 
   for( int irank=0; irank!=MPISIZE; ++irank ) {                 // Loop over MPI ranks

unit_test/direct_gpu.cxx

   FMM.evalP2P(bodies2,jbodies,onCPU);                           // Direct summation on CPU
   FMM.stopTimer("Direct CPU",FMM.printNow);                     // Stop timer
 
-  real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;              // Initialize accumulators
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;              // Initialize accumulators
   FMM.evalError(bodies,bodies2,diff1,norm1,diff2,norm2);        // Evaluate error on the reduced set of bodies
   FMM.printError(diff1,norm1,diff2,norm2);                      // Print the L2 norm error
   FMM.postCalculation();                                        // Kernel post-processing

unit_test/ewald_direct.cxx

 int main() {
   const int numBodies = 1000;                                   // Number of bodies
   const int numTarget = 100;                                    // Number of target points to be used for error eval
-  const real_t xmax = 100.0;                                    // Size of domain
-  const real_t ksize = 44.0;                                    // Ewald wave number
-  const real_t alpha = 0.2;                                     // Ewald alpha value
-  const real_t sigma = .25 / M_PI;                              // Ewald sigma value
+  const real xmax = 100.0;                                      // Size of domain
+  const real ksize = 44.0;                                      // Ewald wave number
+  const real alpha = 0.2;                                       // Ewald alpha value
+  const real sigma = .25 / M_PI;                                // Ewald sigma value
   IMAGES = 3;                                                   // Level of periodic image tree (0 for non-periodic)
   THETA = 1 / sqrt(4);                                          // Multipole acceptance criteria
   Bodies bodies(numBodies);                                     // Define vector of bodies
 
   FMM.startTimer("Set bodies");                                 // Start timer
   srand48(2);                                                   // Seed for random number generator
-  real_t average = 0;                                           // Initialize average charge
+  real average = 0;                                             // Initialize average charge
   for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {        // Loop over bodies
     for( int d=0; d!=3; ++d ) {                                 //  Loop over dimensions
       B->X[d] = drand48() * xmax;                               //   Initialize positions
   FMM.stopTimer("Direct sum",FMM.printNow);                     // Stop timer
   FMM.eraseTimer("Direct sum");                                 // Erase entry from timer to avoid timer overlap
 
-  real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;            // Initialize accumulators
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;              // Initialize accumulators
   FMM.evalError(bodies,FMM.buffer,diff1,norm1,diff2,norm2,true);// Evaluate error
   FMM.printError(diff1,norm1,diff2,norm2);                      // Print the L2 norm error
   FMM.finalize();                                               // Finalize FMM

unit_test/ewald_fmm.cxx

 
 int main() {
   const int numBodies = 1000;                                   // Number of bodies
-  const real_t xmax = 100.0;                                    // Size of domain
-  const real_t ksize = 44.0;                                    // Ewald wave number
-  const real_t alpha = 0.2;                                     // Ewald alpha value
-  const real_t sigma = .25 / M_PI;                              // Ewald sigma value
+  const real xmax = 100.0;                                      // Size of domain
+  const real ksize = 44.0;                                      // Ewald wave number
+  const real alpha = 0.2;                                       // Ewald alpha value
+  const real sigma = .25 / M_PI;                                // Ewald sigma value
   IMAGES = 8;                                                   // Level of periodic image tree (0 for non-periodic)
   THETA = 1 / sqrt(4);                                          // Multipole acceptance criteria
   Bodies bodies(numBodies);                                     // Define vector of bodies
 
   FMM.startTimer("Set bodies");                                 // Start timer
   srand48(0);                                                   // Seed for random number generator
-  real_t average = 0;                                           // Initialize average charge
+  real average = 0;                                             // Initialize average charge
   for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {        // Loop over bodies
     for( int d=0; d!=3; ++d ) {                                 //  Loop over dimensions
       B->X[d] = drand48() * xmax;                               //   Initialize positions
   FMM.stopTimer("Downward",FMM.printNow);                       // Stop timer
   FMM.eraseTimer("Downward");                                   // Erase entry from timer to avoid timer overlap
 
-  real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;            // Initialize accumulators
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;              // Initialize accumulators
   FMM.evalError(bodies,bodies2,diff1,norm1,diff2,norm2,true);   // Evaluate error
   FMM.printError(diff1,norm1,diff2,norm2);                      // Print the L2 norm error
   FMM.finalize();                                               // Finalize FMM

unit_test/ijparallelrun.cxx

   if(FMM.printNow) FMM.writeTime();                             // Write timings of all events to file
   if(FMM.printNow) FMM.writeTime();                             // Write again to have at least two data sets
 
-  real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
   FMM.evalError(bodies,bodies2,diff1,norm1,diff2,norm2);        // Evaluate error on the reduced set of bodies
   MPI_Datatype MPI_TYPE = FMM.getType(diff1);                   // Get MPI datatype
   MPI_Reduce(&diff1,&diff3,1,MPI_TYPE,MPI_SUM,0,MPI_COMM_WORLD);// Reduce difference in potential

unit_test/kernel.cxx

   FMM.preCalculation();                                         // Kernel pre-processing
 
   for( int it=0; it!=5; ++it ) {                                // Loop over kernel iterations
-    real_t dist = (1 << it) / 2;                                //  Distance between source and target
+    real dist = (1 << it) / 2;                                  //  Distance between source and target
     for( B_iter B=ibodies.begin(); B!=ibodies.end(); ++B ) {    //  Loop over target bodies
       for( int d=0; d!=3; ++d ) {                               //   Loop over dimensions
         B->X[d] = -drand48() - dist;                            //    Initialize positions
     FMM.initTarget(ibodies2,IeqJ);                              //  Reinitialize target values
     FMM.evalP2P(ibodies2,jbodies);                              //  Evaluate P2P kernel
 
-    real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;          //  Initialize accumulators
+    real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;            //  Initialize accumulators
     FMM.evalError(ibodies,ibodies2,diff1,norm1,diff2,norm2);    //  Evaluate error
     std::cout << "Distance      : " << dist << std::endl;       //  Print distance between target and source
     FMM.printError(diff1,norm1,diff2,norm2);                    //  Print the L2 norm error

unit_test/overlap_comm.cxx

   if(FMM.printNow) FMM.writeTime();                             // Write again to have at least two data sets
 
 #ifndef VTK
-  real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
   bodies.resize(numTarget);                                     // Shrink target bodies vector to save time
   FMM.evalError(bodies,bodies2,diff1,norm1,diff2,norm2);        // Evaluate error on the reduced set of bodies
   MPI_Datatype MPI_TYPE = FMM.getType(diff1);                   // Get MPI datatype

unit_test/parallelrun.cxx

 #endif
 
 int main() {
-const int numBodies = 10000;                                  // Number of bodies
-const int numTarget = 100;                                    // Number of target points to be used for error eval
-IMAGES = 1;                                                   // Level of periodic image tree (0 for non-periodic)
-THETA = 0.5;                                                  // Multipole acceptance criteria
-Bodies bodies(numBodies);                                     // Define vector of bodies
-Bodies jbodies;                                               // Define vector of source bodies
-Cells cells, jcells;                                          // Define vector of cells
-ParallelFMM<Laplace> FMM;                                     // Instantiate ParallelFMM class
-FMM.initialize();                                             // Initialize FMM
-if( MPIRANK == 0 ) FMM.printNow = true;                       // Print only if MPIRANK == 0
+  const int numBodies = 10000;                                  // Number of bodies
+  const int numTarget = 100;                                    // Number of target points to be used for error eval
+  IMAGES = 1;                                                   // Level of periodic image tree (0 for non-periodic)
+  THETA = 1 / sqrtf(4);                                         // Multipole acceptance criteria
+  Bodies bodies(numBodies);                                     // Define vector of bodies
+  Bodies jbodies;                                               // Define vector of source bodies
+  Cells cells, jcells;                                          // Define vector of cells
+  ParallelFMM<Laplace> FMM;                                     // Instantiate ParallelFMM class
+  FMM.initialize();                                             // Initialize FMM
+  if( MPIRANK == 0 ) FMM.printNow = true;                       // Print only if MPIRANK == 0
 
-FMM.startTimer("Set bodies");                                 // Start timer
-FMM.cube(bodies,MPIRANK+1);                                   // Initialize bodies in a cube
-FMM.stopTimer("Set bodies",FMM.printNow);                     // Stop timer
+  FMM.startTimer("Set bodies");                                 // Start timer
+  FMM.cube(bodies,MPIRANK+1);                                   // Initialize bodies in a cube
+  FMM.stopTimer("Set bodies",FMM.printNow);                     // Stop timer
 
-FMM.startTimer("Set domain");                                 // Start timer
-FMM.setGlobDomain(bodies);                                    // Set global domain size of FMM
-FMM.stopTimer("Set domain",FMM.printNow);                     // Stop timer
+  FMM.startTimer("Set domain");                                 // Start timer
+  FMM.setGlobDomain(bodies);                                    // Set global domain size of FMM
+  FMM.stopTimer("Set domain",FMM.printNow);                     // Stop timer
 
-FMM.octsection(bodies);                                       // Partition domain and redistribute bodies
+  FMM.octsection(bodies);                                       // Partition domain and redistribute bodies
 
 #ifdef TOPDOWN
-FMM.topdown(bodies,cells);                                    // Tree construction (top down) & upward sweep
+  FMM.topdown(bodies,cells);                                    // Tree construction (top down) & upward sweep
 #else
-FMM.bottomup(bodies,cells);                                   // Tree construction (bottom up) & upward sweep
+  FMM.bottomup(bodies,cells);                                   // Tree construction (bottom up) & upward sweep
 #endif
 
-FMM.commBodies(cells);                                        // Send bodies (not receiving yet)
+  FMM.commBodies(cells);                                        // Send bodies (not receiving yet)
 
-jbodies = bodies;                                             // Vector of source bodies
-jcells = cells;                                               // Vector of source cells
-FMM.commCells(jbodies,jcells);                                // Communicate cells (receive bodies here)
+  jbodies = bodies;                                             // Vector of source bodies
+  jcells = cells;                                               // Vector of source cells
+  FMM.commCells(jbodies,jcells);                                // Communicate cells (receive bodies here)
 
-FMM.startTimer("Downward");                                   // Start timer
-FMM.downward(cells,jcells);                                   // Downward sweep
-FMM.stopTimer("Downward",FMM.printNow);                       // Stop timer
-FMM.eraseTimer("Downward");                                   // Erase entry from timer to avoid timer overlap
+  FMM.startTimer("Downward");                                   // Start timer
+  FMM.downward(cells,jcells);                                   // Downward sweep
+  FMM.stopTimer("Downward",FMM.printNow);                       // Stop timer
+  FMM.eraseTimer("Downward");                                   // Erase entry from timer to avoid timer overlap
 
 #ifndef VTK
-FMM.startTimer("Direct sum");                                 // Start timer
-jbodies = bodies;                                             // Copy source bodies
-FMM.sampleBodies(bodies,numTarget);                           // Shrink target bodies vector to save time
-Bodies bodies2 = bodies;                                      // Define new bodies vector for direct sum
-FMM.initTarget(bodies2);                                      // Reinitialize target values
-for( int i=0; i!=MPISIZE; ++i ) {                             // Loop over all MPI processes
-FMM.shiftBodies(jbodies);                                   //  Communicate bodies round-robin
-FMM.evalP2P(bodies2,jbodies);                           //  Direct summation between bodies2 and jbodies2
-if(FMM.printNow) std::cout << "Direct loop   : " << i+1 << "/" << MPISIZE << std::endl;// Print loop counter
-}                                                             // End loop over all MPI processes
-FMM.stopTimer("Direct sum",FMM.printNow);                     // Stop timer
-FMM.eraseTimer("Direct sum");                                 // Erase entry from timer to avoid timer overlap
-if(FMM.printNow) FMM.writeTime();                             // Write timings of all events to file
-if(FMM.printNow) FMM.writeTime();                             // Write again to have at least two data sets
+  FMM.startTimer("Direct sum");                                 // Start timer
+  jbodies = bodies;                                             // Copy source bodies
+  FMM.sampleBodies(bodies,numTarget);                           // Shrink target bodies vector to save time
+  Bodies bodies2 = bodies;                                      // Define new bodies vector for direct sum
+  FMM.initTarget(bodies2);                                      // Reinitialize target values
+  for( int i=0; i!=MPISIZE; ++i ) {                             // Loop over all MPI processes
+    FMM.shiftBodies(jbodies);                                   //  Communicate bodies round-robin
+    FMM.evalP2P(bodies2,jbodies);                           //  Direct summation between bodies2 and jbodies2
+    if(FMM.printNow) std::cout << "Direct loop   : " << i+1 << "/" << MPISIZE << std::endl;// Print loop counter
+  }                                                             // End loop over all MPI processes
+  FMM.stopTimer("Direct sum",FMM.printNow);                     // Stop timer
+  FMM.eraseTimer("Direct sum");                                 // Erase entry from timer to avoid timer overlap
+  if(FMM.printNow) FMM.writeTime();                             // Write timings of all events to file
+  if(FMM.printNow) FMM.writeTime();                             // Write again to have at least two data sets
 
-real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
   FMM.evalError(bodies,bodies2,diff1,norm1,diff2,norm2);        // Evaluate error on the reduced set of bodies
   MPI_Datatype MPI_TYPE = FMM.getType(diff1);                   // Get MPI datatype
   MPI_Reduce(&diff1,&diff3,1,MPI_TYPE,MPI_SUM,0,MPI_COMM_WORLD);// Reduce difference in potential

unit_test/poisson.cxx

   const int level = 4;                                          // Depth of tree
   const int numGrid = k * (1 << level);                         // Number of grid points in one direction
   const int numBodies = numGrid * numGrid * numGrid;            // Number of bodies
-  const real_t L = 10;                                          // Size of Gaussian distribution
+  const real L = 10;                                            // Size of Gaussian distribution
   std::cout << "N             : " << numBodies << std::endl;    // Print number of bodies
   IMAGES = 0;                                                   // Level of periodic image tree (0 for non-periodic)
   THETA = 1 / sqrt(4);                                          // Multipole acceptance criteria
   assert( NCRIT == k*k*k );                                     // NCRIT must be set to k^3 for this app.
 
   FMM.startTimer("Set bodies");                                 // Start timer
-  real_t dV = 8. / numBodies;                                   // Volume per body
+  real dV = 8. / numBodies;                                     // Volume per body
   for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {        // Loop over bodies
     int i = B-bodies.begin();                                   //  Loop counter
     int ix = i / numGrid / numGrid;                             //  x index
   FMM.stopTimer("Downward",FMM.printNow);                       // Stop timer
   FMM.eraseTimer("Downward");                                   // Erase entry from timer to avoid timer overlap
 
-  real_t diff1 = 0, norm1 = 0;                                  // Initialize accumulators
+  real diff1 = 0, norm1 = 0;                                    // Initialize accumulators
   for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {        // Loop over bodies
-    real_t exact = exp(-L * norm(B->X));                        //  Analytical solution
+    real exact = exp(-L * norm(B->X));                          //  Analytical solution
     diff1 += (B->TRG[0] - exact) * (B->TRG[0] - exact);         //  Difference between analytical solution
     norm1 += exact * exact;                                     //  L2 norm of analytical solution
   }                                                             // End loop over bodies

unit_test/serialrun.cxx

   FMM.writeTime();                                              // Write timings of all events to file
   FMM.writeTime();                                              // Write again to have at least two data sets
 
-  real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;            // Initialize accumulators
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;              // Initialize accumulators
   FMM.evalError(bodies,FMM.buffer,diff1,norm1,diff2,norm2);     // Evaluate error on the reduced set of bodies
   FMM.printError(diff1,norm1,diff2,norm2);                      // Print the L2 norm error
 #else

unit_test/skip_tree.cxx

   if(FMM.printNow) FMM.writeTime();                             // Write again to have at least two data sets
 
 #ifndef VTK
-  real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
   bodies.resize(numTarget);                                     // Shrink target bodies vector to save time
   FMM.evalError(bodies,bodies2,diff1,norm1,diff2,norm2);        // Evaluate error on the reduced set of bodies
   MPI_Datatype MPI_TYPE = FMM.getType(diff1);                   // Get MPI datatype

unit_test/unpartition.cxx

   if(FMM.printNow) FMM.writeTime();                             // Write timings of all events to file
   if(FMM.printNow) FMM.writeTime();                             // Write again to have at least two data sets
 
-  real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
   FMM.sampleBodies(bodies,numTarget);                           // Shrink target bodies vector to save time
   FMM.evalError(bodies,bodies2,diff1,norm1,diff2,norm2);        // Evaluate error on the reduced set of bodies
   MPI_Datatype MPI_TYPE = FMM.getType(diff1);                   // Get MPI datatype

unit_test/unsort.cxx

   FMM.writeTime();                                              // Write again to have at least two data sets
 
 #ifndef VTK
-  real_t diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;              // Initialize accumulators
+  real diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;              // Initialize accumulators
   FMM.sampleBodies(bodies,numTarget);                           // Shrink target bodies vector to save time
   FMM.evalError(bodies,bodies2,diff1,norm1,diff2,norm2);        // Evaluate error on the reduced set of bodies
   FMM.printError(diff1,norm1,diff2,norm2);                      // Print the L2 norm error