Commits

Rio Yokota committed 9316ff0

Revert SphericalM2L to P -> 2 * P.
Parameters adjusted for high accuracy Ewald-FMM test.

Comments (0)

Files changed (10)

 EXPAND  = Spherical
 
 ### GCC compiler
-CXX	= mpicxx -ggdb3 -Wall -Wextra -O3 -ffast-math -funroll-loops -fforce-addr -fPIC -I../include
+CXX	= mpicxx -ggdb3 -Wall -Wextra -O3 -fopenmp -ffast-math -funroll-loops -fforce-addr -fPIC -I../include
 ### Intel compiler
 #CXX	= mpicxx -Wall -xHOST -O3 -openmp -funroll-loops -finline-functions -fPIC -ansi-alias -I../include
 
     real fact = 1;                                              // Initialize 2 * m + 1
     real pn = 1;                                                // Initialize Legendre polynomial Pn
     real rhom = 1.0 / rho;                                      // Initialize rho^(-m-1)
-    for( int m=0; m!=P; ++m ) {                                 // Loop over m in Ynm
+    for( int m=0; m!=2*P; ++m ) {                               // Loop over m in Ynm
       complex eim = std::exp(I * real(m * beta));               //  exp(i * m * beta)
       real p = pn;                                              //  Associated Legendre polynomial Pnm
       int npn = m * m + 2 * m;                                  //  Index of Ynm for m > 0
       YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * prefactor[npn] * eim;// theta derivative of r^n * Ynm
       rhom /= rho;                                              //  rho^(-m-1)
       real rhon = rhom;                                         //  rho^(-n-1)
-      for( int n=m+1; n!=P; ++n ) {                             //  Loop over n in Ynm
+      for( int n=m+1; n!=2*P; ++n ) {                           //  Loop over n in Ynm
         int npm = n * n + n + m;                                //   Index of Ynm for m > 0
         int nmm = n * n + n - m;                                //   Index of Ynm for m < 0
         Ynm[npm] = rhon * p * prefactor[npm] * eim;             //   rho^n * Ynm for m > 0
   void preCalculation() {
     const complex I(0.,1.);                                     // Imaginary unit
     factorial = new real  [P];                                  // Factorial
-    prefactor = new real  [P*P];                                // sqrt( (n - |m|)! / (n + |m|)! )
-    Anm       = new real  [P*P];                                // (-1)^n / sqrt( (n + m)! / (n - m)! )
+    prefactor = new real  [4*P*P];                              // sqrt( (n - |m|)! / (n + |m|)! )
+    Anm       = new real  [4*P*P];                              // (-1)^n / sqrt( (n + m)! / (n - m)! )
     Cnm       = new complex [P*P*P*P];                          // M2L translation matrix Cjknm
 
     factorial[0] = 1;                                           // Initialize factorial
       factorial[n] = factorial[n-1] * n;                        //  n!
     }                                                           // End loop to P
 
-    for( int n=0; n!=P; ++n ) {                                 // Loop over n in Anm
+    for( int n=0; n!=2*P; ++n ) {                               // Loop over n in Anm
       for( int m=-n; m<=n; ++m ) {                              //  Loop over m in Anm
         int nm = n*n+n+m;                                       //   Index of Anm
         int nabsm = abs(m);                                     //   |m|
 #endif
 #endif
 
-const int  P        = 15;                                       //!< Order of expansions
+const int  P        = 25;                                       //!< Order of expansions
 const int  NCRIT    = 64;                                       //!< Number of bodies per cell
 const int  MAXBODY  = 50000;                                    //!< Maximum number of bodies per GPU kernel
 const int  MAXCELL  = 10000000;                                 //!< Maximum number of bodies/coefs in cell per GPU kernel

kernel/CPUEvaluator.cxx

 void Evaluator<equation>::evalEwaldReal(Cells &cells) {         // Evaluate queued Ewald real kernels
   startTimer("evalEwaldReal");                                  // Start timer
   Ci0 = cells.begin();                                          // Set begin iterator
-  for( C_iter Ci=cells.begin(); Ci!=cells.end(); ++Ci ) {       // Loop over cells
-    while( !listP2P[Ci-Ci0].empty() ) {                         //  While M2P interaction list is not empty
-      C_iter Cj = listP2P[Ci-Ci0].back();                       //   Set source cell iterator
+#pragma omp parallel for
+  for( int i=0; i<int(cells.size()); ++i ) {                    // Loop over cells
+    C_iter Ci = Ci0 + i;                                        //  Target cell iterator
+    while( !listP2P[i].empty() ) {                              //  While M2P interaction list is not empty
+      C_iter Cj = listP2P[i].back();                            //   Set source cell iterator
       EwaldReal(Ci,Cj);                                         //   Perform Ewald real kernel
-      listP2P[Ci-Ci0].pop_back();                               //   Pop last element from M2P interaction list
+      listP2P[i].pop_back();                                    //   Pop last element from M2P interaction list
     }                                                           //  End while for M2P interaction list
   }                                                             // End loop over cells topdown
   listP2P.clear();                                              // Clear interaction lists

kernel/CPUEwaldLaplace.cxx

 namespace {
 void dft(Ewalds &ewalds, Bodies &bodies, real R0) {
   real scale = M_PI / R0;
-  for( E_iter E=ewalds.begin(); E!=ewalds.end(); ++E ) {
+#pragma omp parallel for
+  for( int i=0; i<int(ewalds.size()); ++i ) {
+    E_iter E = ewalds.begin() + i;
     E->REAL = E->IMAG = 0;
     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {
       real th = 0;
 
 void idft(Ewalds &ewalds, Bodies &bodies, real R0) {
   real scale = M_PI / R0;
-  for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {
+#pragma omp parallel for
+  for( int i=0; i<int(bodies.size()); ++i ) {
+    B_iter B = bodies.begin() + i;
     vec<4,real> TRG = 0;
     for( E_iter E=ewalds.begin(); E!=ewalds.end(); ++E ) {
       real th = 0;
 
 template<>
 void Kernel<Laplace>::EwaldReal(C_iter Ci, C_iter Cj) const {   // Ewald real part on CPU
-  for( B_iter Bi=Ci->LEAF; Bi!=Ci->LEAF+Ci->NDLEAF; ++Bi ) {    // Loop over target bodies
+  for( int i=0; i<Ci->NDLEAF; ++i ) {                           // Loop over target bodies
+    B_iter Bi = Ci->LEAF + i;                                   //  Target body iterator
     for( B_iter Bj=Cj->LEAF; Bj!=Cj->LEAF+Cj->NDLEAF; ++Bj ) {  //  Loop over source bodies
       vect dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
       for( int d=0; d<3; d++ ) {                                //   Loop over dimensions

kernel/CPUP2P.cxx

 template<>
 void Kernel<Laplace>::P2P(C_iter Ci, C_iter Cj) const {         // Laplace P2P kernel on CPU
 #ifndef SPARC_SIMD
-//#pragma omp parallel for
+#pragma omp parallel for
   for( int i=0; i<Ci->NDLEAF; ++i ) {                           // Loop over target bodies
     B_iter Bi = Ci->LEAF+i;                                     //  Target body iterator
     real P0 = 0;                                                //  Initialize potential

kernel/CPUSphericalLaplace.cxx

 template<>
 void Kernel<Laplace>::P2M(C_iter Cj) {
   real Rmax = 0;
-  complex Ynm[P*P], YnmTheta[P*P];
+  complex Ynm[4*P*P], YnmTheta[4*P*P];
   for( B_iter B=Cj->LEAF; B!=Cj->LEAF+Cj->NDLEAF; ++B ) {
     vect dist = B->X - Cj->X;
     real R = std::sqrt(norm(dist));
 template<>
 void Kernel<Laplace>::M2M(C_iter Ci) {
   const complex I(0.,1.);
-  complex Ynm[P*P], YnmTheta[P*P];
+  complex Ynm[4*P*P], YnmTheta[4*P*P];
   real Rmax = Ci->RMAX;
   for( C_iter Cj=Cj0+Ci->CHILD; Cj!=Cj0+Ci->CHILD+Ci->NCHILD; ++Cj ) {
     vect dist = Ci->X - Cj->X;
 
 template<>
 void Kernel<Laplace>::M2L(C_iter Ci, C_iter Cj) const {
-  complex Ynm[P*P], YnmTheta[P*P];
+  complex Ynm[4*P*P], YnmTheta[4*P*P];
   vect dist = Ci->X - Cj->X - Xperiodic;
   real rho, alpha, beta;
   cart2sph(rho,alpha,beta,dist);
       int jk = j * j + j + k;
       int jks = j * (j + 1) / 2 + k;
       complex L = 0;
-      for( int n=0; n!=P-j; ++n ) {
+      for( int n=0; n!=P; ++n ) {
         for( int m=-n; m<0; ++m ) {
           int nm   = n * n + n + m;
           int nms  = n * (n + 1) / 2 - m;
 template<>
 void Kernel<Laplace>::M2P(C_iter Ci, C_iter Cj) const {
   const complex I(0.,1.);                                       // Imaginary unit
-  complex Ynm[P*P], YnmTheta[P*P];
+  complex Ynm[4*P*P], YnmTheta[4*P*P];
   for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {
     vect dist = B->X - Cj->X - Xperiodic;
     vect spherical = 0;
 template<>
 void Kernel<Laplace>::L2P(C_iter Ci) const {
   const complex I(0.,1.);                                       // Imaginary unit
-  complex Ynm[P*P], YnmTheta[P*P];
+  complex Ynm[4*P*P], YnmTheta[4*P*P];
   for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {
     vect dist = B->X - Ci->X;
     vect spherical = 0;

kernel/GPUSphericalLaplace.cu

   gpureal x = cosf(alpha);                                      // x = cos(alpha)
   gpureal y = sinf(alpha);                                      // y = sin(alpha)
   gpureal rho_1 = 1 / rho;                                      // 1 / rho
-  for( int l=threadIdx.x; l<(P+1)*P/2; l+=THREADS ) {           // Loop over coefficients in Ynm
+  for( int l=threadIdx.x; l<(2*P+1)*P/2; l+=THREADS ) {         // Loop over coefficients in Ynm
     gpureal fact = 1;                                           //  Initialize 2 * m + 1
     gpureal pn = 1;                                             //  Initialize Legendre polynomial Pn
     gpureal rhom = rho_1;                                       //  Initialize rho^(-m-1)
   k = threadIdx.x - k;
   if( threadIdx.x >= NTERM ) j = k = 0;
   gpureal ajk = ODDEVEN(j) * rsqrtf(factShrd[j-k] * factShrd[j+k]);
-  for( int n=0; n<P-j; ++n ) {
+  for( int n=0; n<P; ++n ) {
     for( int m=-n; m<0; ++m ) {
       int jnkm = (j + n) * (j + n + 1) / 2 - m + k;
       gpureal ere = cosf((m - k) * beta);
   gpureal target[2] = {0, 0};
   __shared__ gpureal sourceShrd[2*THREADS];
   __shared__ gpureal factShrd[2*P];
-  __shared__ gpureal YnmShrd[NTERM];
+  __shared__ gpureal YnmShrd[4*NTERM];
   gpureal fact = EPS;
   for( int i=0; i<2*P; ++i ) {
     factShrd[i] = fact;

unit_test/ewald_direct.cxx

   const int numBodies = 1000;                                   // Number of bodies
   const int numTarget = 100;                                    // Number of target points to be used for error eval
   const real xmax = 100.0;                                      // Size of domain
-  const real ksize = 11.0;                                      // Ewald wave number
-  const real alpha = 0.1;                                       // Ewald alpha value
+  const real ksize = 44.0;                                      // Ewald wave number
+  const real alpha = 0.2;                                       // Ewald alpha value
   const real sigma = .25 / M_PI;                                // Ewald sigma value
   IMAGES = 3;                                                   // Level of periodic image tree (0 for non-periodic)
   THETA = 1 / sqrt(4);                                          // Multipole acceptance criteria

unit_test/ewald_fmm.cxx

 int main() {
   const int numBodies = 1000;                                   // Number of bodies
   const real xmax = 100.0;                                      // Size of domain
-  const real ksize = 11.0;                                      // Ewald wave number
-  const real alpha = 0.1;                                       // Ewald alpha value
+  const real ksize = 44.0;                                      // Ewald wave number
+  const real alpha = 0.2;                                       // Ewald alpha value
   const real sigma = .25 / M_PI;                                // Ewald sigma value
-  IMAGES = 3;                                                   // Level of periodic image tree (0 for non-periodic)
+  IMAGES = 8;                                                   // Level of periodic image tree (0 for non-periodic)
   THETA = 1 / sqrt(4);                                          // Multipole acceptance criteria
   Bodies bodies(numBodies);                                     // Define vector of bodies
   Bodies jbodies;                                               // Define vector of source bodies