Commits

Rio Yokota committed 6486791

SphericalM2L kernel 2*P -> P loop.

  • Participants
  • Parent commits 5779782

Comments (0)

Files changed (4)

File Makefile.include

 .PHONY: docs
 
 ### CUDA path
-CUDA_INSTALL_PATH = /usr/local/cuda4.1/cuda
+CUDA_INSTALL_PATH = /usr/local/cuda
 
 ### VTK path
 VTK_INCLUDE_PATH = /usr/include/vtk-5.8
 # -Kdalign,ns,mfunc,eval,prefetch_conditional,ilfunc -x-
 
 ### CUDA compiler
-NVCC    = nvcc -Xcompiler -fopenmp --ptxas-options=-v -O3\
+NVCC    = nvcc --compiler-bindir=/usr/bin/gcc-4.4 -Xcompiler -fopenmp --ptxas-options=-v -O3\
 	 -use_fast_math -arch=sm_21 -I../include -I$(CUDA_INSTALL_PATH)/include
 
 ### Base flags

File include/kernel.h

     real fact = 1;                                              // Initialize 2 * m + 1
     real pn = 1;                                                // Initialize Legendre polynomial Pn
     real rhom = 1.0 / rho;                                      // Initialize rho^(-m-1)
-    for( int m=0; m!=2*P; ++m ) {                               // Loop over m in Ynm
+    for( int m=0; m!=P; ++m ) {                                 // Loop over m in Ynm
       complex eim = std::exp(I * real(m * beta));               //  exp(i * m * beta)
       real p = pn;                                              //  Associated Legendre polynomial Pnm
       int npn = m * m + 2 * m;                                  //  Index of Ynm for m > 0
       YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * prefactor[npn] * eim;// theta derivative of r^n * Ynm
       rhom /= rho;                                              //  rho^(-m-1)
       real rhon = rhom;                                         //  rho^(-n-1)
-      for( int n=m+1; n!=2*P; ++n ) {                           //  Loop over n in Ynm
+      for( int n=m+1; n!=P; ++n ) {                             //  Loop over n in Ynm
         int npm = n * n + n + m;                                //   Index of Ynm for m > 0
         int nmm = n * n + n - m;                                //   Index of Ynm for m < 0
         Ynm[npm] = rhon * p * prefactor[npm] * eim;             //   rho^n * Ynm for m > 0
   void preCalculation() {
     const complex I(0.,1.);                                     // Imaginary unit
     factorial = new real  [P];                                  // Factorial
-    prefactor = new real  [4*P*P];                              // sqrt( (n - |m|)! / (n + |m|)! )
-    Anm       = new real  [4*P*P];                              // (-1)^n / sqrt( (n + m)! / (n - m)! )
+    prefactor = new real  [P*P];                                // sqrt( (n - |m|)! / (n + |m|)! )
+    Anm       = new real  [P*P];                                // (-1)^n / sqrt( (n + m)! / (n - m)! )
     Cnm       = new complex [P*P*P*P];                          // M2L translation matrix Cjknm
 
     factorial[0] = 1;                                           // Initialize factorial
       factorial[n] = factorial[n-1] * n;                        //  n!
     }                                                           // End loop to P
 
-    for( int n=0; n!=2*P; ++n ) {                               // Loop over n in Anm
+    for( int n=0; n!=P; ++n ) {                                 // Loop over n in Anm
       for( int m=-n; m<=n; ++m ) {                              //  Loop over m in Anm
         int nm = n*n+n+m;                                       //   Index of Anm
         int nabsm = abs(m);                                     //   |m|

File kernel/CPUSphericalLaplace.cxx

 template<>
 void Kernel<Laplace>::P2M(C_iter Cj) {
   real Rmax = 0;
-  complex Ynm[4*P*P], YnmTheta[4*P*P];
+  complex Ynm[P*P], YnmTheta[P*P];
   for( B_iter B=Cj->LEAF; B!=Cj->LEAF+Cj->NCLEAF; ++B ) {
     vect dist = B->X - Cj->X;
     real R = std::sqrt(norm(dist));
     evalMultipole(rho,alpha,-beta,Ynm,YnmTheta);
     for( int n=0; n!=P; ++n ) {
       for( int m=0; m<=n; ++m ) {
-        const int nm  = n * n + n + m;
-        const int nms = n * (n + 1) / 2 + m;
+        int nm  = n * n + n + m;
+        int nms = n * (n + 1) / 2 + m;
         Cj->M[nms] += B->SRC * Ynm[nm];
       }
     }
 template<>
 void Kernel<Laplace>::M2M(C_iter Ci) {
   const complex I(0.,1.);
-  complex Ynm[4*P*P], YnmTheta[4*P*P];
+  complex Ynm[P*P], YnmTheta[P*P];
   real Rmax = Ci->RMAX;
   for( C_iter Cj=Cj0+Ci->CHILD; Cj!=Cj0+Ci->CHILD+Ci->NCHILD; ++Cj ) {
     vect dist = Ci->X - Cj->X;
     evalMultipole(rho,alpha,-beta,Ynm,YnmTheta);
     for( int j=0; j!=P; ++j ) {
       for( int k=0; k<=j; ++k ) {
-        const int jk = j * j + j + k;
-        const int jks = j * (j + 1) / 2 + k;
+        int jk = j * j + j + k;
+        int jks = j * (j + 1) / 2 + k;
         complex M = 0;
         for( int n=0; n<=j; ++n ) {
           for( int m=-n; m<=std::min(k-1,n); ++m ) {
             if( j-n >= k-m ) {
-              const int jnkm  = (j - n) * (j - n) + j - n + k - m;
-              const int jnkms = (j - n) * (j - n + 1) / 2 + k - m;
-              const int nm    = n * n + n + m;
+              int jnkm  = (j - n) * (j - n) + j - n + k - m;
+              int jnkms = (j - n) * (j - n + 1) / 2 + k - m;
+              int nm    = n * n + n + m;
               M += Cj->M[jnkms] * std::pow(I,real(m-abs(m))) * Ynm[nm]
                  * real(ODDEVEN(n) * Anm[nm] * Anm[jnkm] / Anm[jk]);
             }
           }
           for( int m=k; m<=n; ++m ) {
             if( j-n >= m-k ) {
-              const int jnkm  = (j - n) * (j - n) + j - n + k - m;
-              const int jnkms = (j - n) * (j - n + 1) / 2 - k + m;
-              const int nm    = n * n + n + m;
+              int jnkm  = (j - n) * (j - n) + j - n + k - m;
+              int jnkms = (j - n) * (j - n + 1) / 2 - k + m;
+              int nm    = n * n + n + m;
               M += std::conj(Cj->M[jnkms]) * Ynm[nm]
                  * real(ODDEVEN(k+n+m) * Anm[nm] * Anm[jnkm] / Anm[jk]);
             }
 
 template<>
 void Kernel<Laplace>::M2L(C_iter Ci, C_iter Cj) const {
-  complex Ynm[4*P*P], YnmTheta[4*P*P];
+  complex Ynm[P*P], YnmTheta[P*P];
   vect dist = Ci->X - Cj->X - Xperiodic;
   real rho, alpha, beta;
   cart2sph(rho,alpha,beta,dist);
   evalLocal(rho,alpha,beta,Ynm,YnmTheta);
   for( int j=0; j!=P; ++j ) {
     for( int k=0; k<=j; ++k ) {
-      const int jk = j * j + j + k;
-      const int jks = j * (j + 1) / 2 + k;
+      int jk = j * j + j + k;
+      int jks = j * (j + 1) / 2 + k;
       complex L = 0;
-      for( int n=0; n!=P; ++n ) {
+      for( int n=0; n!=P-j; ++n ) {
         for( int m=-n; m<0; ++m ) {
-          const int nm   = n * n + n + m;
-          const int nms  = n * (n + 1) / 2 - m;
-          const int jknm = jk * P * P + nm;
-          const int jnkm = (j + n) * (j + n) + j + n + m - k;
+          int nm   = n * n + n + m;
+          int nms  = n * (n + 1) / 2 - m;
+          int jknm = jk * P * P + nm;
+          int jnkm = (j + n) * (j + n) + j + n + m - k;
           L += std::conj(Cj->M[nms]) * Cnm[jknm] * Ynm[jnkm];
         }
         for( int m=0; m<=n; ++m ) {
-          const int nm   = n * n + n + m;
-          const int nms  = n * (n + 1) / 2 + m;
-          const int jknm = jk * P * P + nm;
-          const int jnkm = (j + n) * (j + n) + j + n + m - k;
+          int nm   = n * n + n + m;
+          int nms  = n * (n + 1) / 2 + m;
+          int jknm = jk * P * P + nm;
+          int jnkm = (j + n) * (j + n) + j + n + m - k;
           L += Cj->M[nms] * Cnm[jknm] * Ynm[jnkm];
         }
       }
 template<>
 void Kernel<Laplace>::M2P(C_iter Ci, C_iter Cj) const {
   const complex I(0.,1.);                                       // Imaginary unit
-  complex Ynm[4*P*P], YnmTheta[4*P*P];
+  complex Ynm[P*P], YnmTheta[P*P];
   for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {
     vect dist = B->X - Cj->X - Xperiodic;
     vect spherical = 0;
 template<>
 void Kernel<Laplace>::L2L(C_iter Ci) const {
   const complex I(0.,1.);
-  complex Ynm[4*P*P], YnmTheta[4*P*P];
+  complex Ynm[P*P], YnmTheta[P*P];
   C_iter Cj = Ci0 + Ci->PARENT;
   vect dist = Ci->X - Cj->X;
   real rho, alpha, beta;
   evalMultipole(rho,alpha,beta,Ynm,YnmTheta);
   for( int j=0; j!=P; ++j ) {
     for( int k=0; k<=j; ++k ) {
-      const int jk = j * j + j + k;
-      const int jks = j * (j + 1) / 2 + k;
+      int jk = j * j + j + k;
+      int jks = j * (j + 1) / 2 + k;
       complex L = 0;
       for( int n=j; n!=P; ++n ) {
         for( int m=j+k-n; m<0; ++m ) {
-          const int jnkm = (n - j) * (n - j) + n - j + m - k;
-          const int nm   = n * n + n - m;
-          const int nms  = n * (n + 1) / 2 - m;
+          int jnkm = (n - j) * (n - j) + n - j + m - k;
+          int nm   = n * n + n - m;
+          int nms  = n * (n + 1) / 2 - m;
           L += std::conj(Cj->L[nms]) * Ynm[jnkm]
              * real(ODDEVEN(k) * Anm[jnkm] * Anm[jk] / Anm[nm]);
         }
         for( int m=0; m<=n; ++m ) {
           if( n-j >= abs(m-k) ) {
-            const int jnkm = (n - j) * (n - j) + n - j + m - k;
-            const int nm   = n * n + n + m;
-            const int nms  = n * (n + 1) / 2 + m;
+            int jnkm = (n - j) * (n - j) + n - j + m - k;
+            int nm   = n * n + n + m;
+            int nms  = n * (n + 1) / 2 + m;
             L += Cj->L[nms] * std::pow(I,real(m-k-abs(m-k)))
                * Ynm[jnkm] * Anm[jnkm] * Anm[jk] / Anm[nm];
           }
 template<>
 void Kernel<Laplace>::L2P(C_iter Ci) const {
   const complex I(0.,1.);                                       // Imaginary unit
-  complex Ynm[4*P*P], YnmTheta[4*P*P];
+  complex Ynm[P*P], YnmTheta[P*P];
   for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NCLEAF; ++B ) {
     vect dist = B->X - Ci->X;
     vect spherical = 0;

File kernel/GPUSphericalLaplace.cu

   gpureal x = cosf(alpha);                                      // x = cos(alpha)
   gpureal y = sinf(alpha);                                      // y = sin(alpha)
   gpureal rho_1 = 1 / rho;                                      // 1 / rho
-  for( int l=threadIdx.x; l<(2*P+1)*P; l+=THREADS ) {           // Loop over coefficients in Ynm
+  for( int l=threadIdx.x; l<(P+1)*P/2; l+=THREADS ) {           // Loop over coefficients in Ynm
     gpureal fact = 1;                                           //  Initialize 2 * m + 1
     gpureal pn = 1;                                             //  Initialize Legendre polynomial Pn
     gpureal rhom = rho_1;                                       //  Initialize rho^(-m-1)
   k = threadIdx.x - k;
   if( threadIdx.x >= NTERM ) j = k = 0;
   gpureal ajk = ODDEVEN(j) * rsqrtf(factShrd[j-k] * factShrd[j+k]);
-  for( int n=0; n<P; ++n ) {
+  for( int n=0; n<P-j; ++n ) {
     for( int m=-n; m<0; ++m ) {
       int jnkm = (j + n) * (j + n + 1) / 2 - m + k;
       gpureal ere = cosf((m - k) * beta);
   gpureal target[2] = {0, 0};
   __shared__ gpureal sourceShrd[2*THREADS];
   __shared__ gpureal factShrd[2*P];
-  __shared__ gpureal YnmShrd[4*NTERM];
+  __shared__ gpureal YnmShrd[NTERM];
   gpureal fact = EPS;
   for( int i=0; i<2*P; ++i ) {
     factShrd[i] = fact;