Commits

Rio Yokota committed 6e3bc82

A few minor fixes.

  • Participants
  • Parent commits 442b9cc

Comments (0)

Files changed (6)

File Makefile.include

 EXAFMM_INCLUDE_PATH = ../include
 EXAFMM_LIBRARY_PATH = ../lib
 
-### CUDA path
-CUDA_INSTALL_PATH = /usr/local/cuda
-CUDA_SDK_PATH = /usr/local/cuda_sdk/C
-
-### VTK path
-VTK_INCLUDE_PATH = /usr/include/vtk-5.8
-VTK_LIBRARY_PATH = /usr/lib/vtk-5.8
-
 ### choose kernel
 EQUATION = Laplace
 #EQUATION = Yukawa
 #CXX	= mpicxx -Wall -xHOST -O3 -funroll-loops -finline-functions -ansi-alias
 ### BG/P compiler
 #CXX	= mpixlcxx_r -qarch=450 -qtune=450 -O3
-### TAU compiler
-#CXX	= tau_cxx.sh -ggdb3 -Wall -Wextra -Wshadow -Wuninitialized -O3 -msse4.2 -ffast-math -funroll-loops -fforce-addr -fbounds-check
-
-### CUDA compiler
-NVCC	= nvcc --compiler-bindir=/usr/bin/g++-4.4 -Xcompiler -fopenmp --ptxas-options=-v\
-	-O3 -use_fast_math -arch=sm_21\
-	-I$(EXAFMM_INCLUDE_PATH) -I$(CUDA_INSTALL_PATH)/include -I$(CUDA_SDK_PATH)/common/inc
 
 ### Base flags
 CXX	+= -I$(EXAFMM_INCLUDE_PATH)
 #LFLAGS	+= -std=c++0x -DTBB -ltbb
 
 ### MassiveThreads flags (doesn't work with OpenMP) : MassiveThreads is available from http://code.google.com/p/massivethreads/
-#LFLAGS	+= -std=c++0x -DMTHREAD -lmyth
+LFLAGS	+= -std=c++0x -DMTHREAD -lmyth
 
 ### PAPI flags
 #LFLAGS	+= -DPAPI -lpapi
 
 ### VTK flags : VTK is available from http://www.vtk.org/VTK/resources/software.html
+### VTK path
+#VTK_INCLUDE_PATH = /usr/include/vtk-5.8
+#VTK_LIBRARY_PATH = /usr/lib/vtk-5.8
 #CXX	+= -I$(VTK_INCLUDE_PATH)
-#VFLAGS	= -L$(VTK_LIBRARY_PATH) -DVTK -lvtkRendering -lvtkGraphics -lvtkFiltering -lvtkViews -lvtkCommon -lvtkWidgets -lvtkIO
+#LFLAGS	= -L$(VTK_LIBRARY_PATH) -DVTK -lvtkRendering -lvtkGraphics -lvtkFiltering -lvtkViews -lvtkCommon -lvtkWidgets -lvtkIO
 
 ifeq ($(DEVICE),GPU)
+### CUDA path
+CUDA_INSTALL_PATH = /usr/local/cuda
+CUDA_SDK_PATH = /usr/local/cuda_sdk/C
+### CUDA compiler
+NVCC	= nvcc --compiler-bindir=/usr/bin/g++-4.4 -Xcompiler -fopenmp --ptxas-options=-v\
+	-O3 -use_fast_math -arch=sm_21\
+	-I$(EXAFMM_INCLUDE_PATH) -I$(CUDA_INSTALL_PATH)/include -I$(CUDA_SDK_PATH)/common/inc
 ### CUDA flags
 LFLAGS  += -L$(CUDA_INSTALL_PATH)/lib64 -L$(CUDA_SDK_PATH)/lib -lcuda -lcudart -lcutil_x86_64 -lstdc++ -ldl -lm
 endif

File examples/Makefile

 #LFLAGS	+= -DMANY
 
 serial	: serial.cxx $(KERNELS)
-	$(CXX) $? $(LFLAGS) $(VFLAGS)
+	$(CXX) $? $(LFLAGS)
 	./a.out
 
 parallel: parallel.cxx $(KERNELS)
-	$(CXX) $? $(LFLAGS) $(VFLAGS)
+	$(CXX) $? $(LFLAGS)
 	mpirun -np 32 ./a.out
 
 wrapper: wrapper.cxx $(KERNELS)

File examples/wrapper.cxx

   MPI_Comm_rank(MPI_COMM_WORLD, &mpirank);
   const int N = 1000000;
   const double size = 2 * M_PI;
-  double *xi     = new double [3*N];
-  double *qi     = new double [N];
-  double *pi     = new double [N];
-  double *fi     = new double [3*N];
-  double *pd     = new double [N];
-  double *fd     = new double [3*N];
-  double *xj     = new double [3*N];
-  double *qj     = new double [N];
+  double *xi = new double [3*N];
+  double *qi = new double [N];
+  double *pi = new double [N];
+  double *fi = new double [3*N];
+  double *pd = new double [N];
+  double *fd = new double [3*N];
+  double *xj = new double [3*N];
+  double *qj = new double [N];
 
   srand48(mpirank);
   for( int i=0; i!=N; ++i ) {

File include/macros.h

 #define assert(x)
 #endif
 
+// SIMD instruction
+#if __AVX__
+#include <immintrin.h>
+#elif __SSE4_2__
+#include <nmmintrin.h>
+#elif __SSE4_1__
+#include <smmintrin.h>
+#elif __SSSE3__
+#include <tmmintrin.h>
+#elif __SSE3__
+#include <pmmintrin.h>
+#elif __SSE2__
+#include <emmintrin.h>
+#elif __SSE__
+#include <xmmintrin.h>
 #endif
+
+#endif

File include/serialfmm.h

 #endif
     real_t x = 1.0 / THETA;                                     // Inverse of theta
 #if ERROR_OPT
-    real_t a = c * pow(std::abs(C->M[0]),1.0/3);                // Cell coefficient
+    real_t a = c * powf(std::abs(C->M[0]),1.0/3);               // Cell coefficient
     for (int i=0; i<5; i++) {                                   // Newton-Raphson iteration
       real_t f = x * x - 2 * x + 1 - a * pow(x,-P);             //  Function value
       real_t df = (P + 2) * x - 2 * (P + 1) + P / x;            //  Function derivative value
     Ci0 = cells.begin();                                        // Set iterator of target root cell
     Cj0 = cells.begin();                                        // Set iterator of source root cell
     upwardRecursion(Ci0, Ci0);                                  // Recursive call for upward pass
-    real_t c = (1 - THETA) * (1 - THETA) / pow(THETA,P+2) / pow(std::abs(Ci0->M[0]),1.0/3); // Root coefficient
+    real_t c = (1 - THETA) * (1 - THETA) / pow(THETA,P+2) / powf(std::abs(Ci0->M[0]),1.0/3); // Root coefficient
     setRcrit(Ci0, Ci0, c);                                      // Error optimization of Rcrit
     for (C_iter C=cells.begin(); C!=cells.begin()+9; C++) {     // Loop over top 2 levels of cells
       C->RCRIT *= 10;                                           //  Prevent approximation

File kernels/LaplaceCartesianCPU.cxx

-#include <immintrin.h>
-#define KERNEL
 #include "kernel.h"
-#undef KERNEL
 
 template<typename T, int nx, int ny, int nz>
 struct Index {