Commits

Rio Yokota committed c44f78f

sm_21 -> sm_13 much faster.

Comments (0)

Files changed (4)

 #CXX	= mpiFCCpx -Kfast,openmp
 ### CUDA compiler
 NVCC    = nvcc -Xcompiler -fopenmp --ptxas-options=-v -O3\
-	 -use_fast_math -arch=sm_21 -I../include -I$(CUDA_INSTALL_PATH)/include
+	 -use_fast_math -arch=sm_13 -I../include -I$(CUDA_INSTALL_PATH)/include
 
 ### Base flags
 LFLAGS  = -D$(DEVICE) -D$(EXPAND)
 #endif
 #endif
 
-const int  P        = 8;                                        //!< Order of expansions
-const int  NCRIT    = 64;                                       //!< Number of bodies per cell
+const int  P        = 10;                                       //!< Order of expansions
+const int  NCRIT    = 1024;                                     //!< Number of bodies per cell
 const int  MAXBODY  = 50000;                                    //!< Maximum number of bodies per GPU kernel
 const int  MAXCELL  = 10000000;                                 //!< Maximum number of bodies/coefs in cell per GPU kernel
 const real CLET     = 2;                                        //!< LET opening criteria

kernel/GPUSphericalLaplace.cu

     cart2sph(rho,alpha,beta,d.x,d.y,d.z);
     evalMultipole(YnmShrd,rho,alpha,factShrd);
     LaplaceM2M_core(target,beta,factShrd,YnmShrd,sourceShrd);
-    if(d.x*d.x+d.y*d.y+d.z*d.z<EPS&&threadIdx.x==0) printf("#FMM output: %f\n",target[0]);
+    //if(d.x*d.x+d.y*d.y+d.z*d.z<EPS&&threadIdx.x==0) printf("#FMM output: %f\n",target[0]);
   }
   itarget = blockIdx.x * THREADS + threadIdx.x;
   targetGlob[2*itarget+0] = target[0];

unit_test/serialrun.cxx

 #endif
 
 int main() {
-  const int numBodies = 100000;                                 // Number of bodies
+  const int numBodies = 1000000;                                // Number of bodies
   const int numTarget = 100;                                    // Number of target points to be used for error eval
   IMAGES = 0;                                                   // Level of periodic image tree (0 for non-periodic)
   THETA = 0.5;                                                  // Multipole acceptance criteria