Commits

Rio Yokota committed bd684c3

Queue wasn't being used in GPU evaluator. Removed cutil dependency.

Comments (0)

Files changed (6)

 VTK_LIBRARY_PATH = /usr/lib/vtk-5.8
 
 ### choose CPU or GPU
-DEVICE  = CPU
-#DEVICE  = GPU
+#DEVICE  = CPU
+DEVICE  = GPU
 
 ### choose Cartesian or spherical expansion
 #EXPAND  = Cartesian
 # -Kdalign,ns,mfunc,eval,prefetch_conditional,ilfunc -x-
 
 ### CUDA compiler
-NVCC    = nvcc --compiler-bindir=/usr/bin/g++-4.4 -Xcompiler -fopenmp --ptxas-options=-v -O3\
-	 -use_fast_math -arch=sm_21 -I../include -I$(CUDA_INSTALL_PATH)/include -I$(CUDA_SDK_PATH)/common/inc
+NVCC    = nvcc -Xcompiler -fopenmp --ptxas-options=-v -O3\
+	 -use_fast_math -arch=sm_21 -I../include -I$(CUDA_INSTALL_PATH)/include
 
 ### Base flags
 LFLAGS  = -D$(DEVICE) -D$(EXPAND)
 
 #ifeq ($(DEVICE),GPU)
 ### CUDA flags
-LFLAGS  += -L$(CUDA_INSTALL_PATH)/lib64 -lcuda -lcudart -lstdc++ -ldl -lm
+LFLAGS  += -DQUEUE -L$(CUDA_INSTALL_PATH)/lib64 -lcuda -lcudart -lstdc++ -ldl -lm
 #endif
 
 OBJECT = ../kernel/$(DEVICE)$(EXPAND)Laplace.o ../kernel/$(DEVICE)VanDerWaals.o\
 #include <utility>
 #include <vector>
 #include "vec.h"                                                //!< My vector type with operator overloading
-#include <xmmintrin.h>
+//#include <xmmintrin.h>
 #if PAPI
 #include <papi.h>
 #endif

kernel/GPUEvaluator.cxx

 
 template<Equation equation>
 void Evaluator<equation>::evalM2L(C_iter Ci, C_iter Cj) {       // Queue single M2L kernel
+  if( Ci-Ci0 == 579 && Cj-Cj0 == 512 ) std::cout << listM2L[579].size() << std::endl;
   listM2L[Ci-Ci0].push_back(Cj);                                // Push source cell into M2L interaction list
   flagM2L[Ci-Ci0][Cj] |= Iperiodic;                             // Flip bit of periodic image flag
   NM2L++;                                                       // Count M2L kernel execution

kernel/GPUEwaldLaplace.cu

   if( numBlocks != 0 ) {\
     LaplaceEwaldReal_GPU<<< numBlocks, THREADS >>>(keysDevc,rangeDevc,targetDevc,sourceDevc);\
   }\
-  CUT_CHECK_ERROR("Kernel execution failed");\
   cudaThreadSynchronize();\
   stopTimer("EwaldReal GPU");\
 }

kernel/GPUSphericalLaplace.cu

 #define KERNEL
 #include "kernel.h"
 #undef KERNEL
-#include <cutil.h>
 
 __device__ __constant__ gpureal constDevc[514];                 // Constants on device
 
   cudaThreadSynchronize();
   startTimer("cudaMalloc");
   if( keysHost.size() > keysDevcSize ) {
-    if( keysDevcSize != 0 ) CUDA_SAFE_CALL(cudaFree(keysDevc));
-    CUDA_SAFE_CALL(cudaMalloc( (void**) &keysDevc, keysHost.size()*sizeof(int) ));
+    if( keysDevcSize != 0 ) cudaFree(keysDevc);
+    cudaMalloc( (void**) &keysDevc, keysHost.size()*sizeof(int) );
     keysDevcSize = keysHost.size();
   }
   if( rangeHost.size() > rangeDevcSize ) {
-    if( rangeDevcSize != 0 ) CUDA_SAFE_CALL(cudaFree(rangeDevc));
-    CUDA_SAFE_CALL(cudaMalloc( (void**) &rangeDevc, rangeHost.size()*sizeof(int) ));
+    if( rangeDevcSize != 0 ) cudaFree(rangeDevc);
+    cudaMalloc( (void**) &rangeDevc, rangeHost.size()*sizeof(int) );
     rangeDevcSize = rangeHost.size();
   }
   if( sourceHost.size() > sourceDevcSize ) {
-    if( sourceDevcSize != 0 ) CUDA_SAFE_CALL(cudaFree(sourceDevc));
-    CUDA_SAFE_CALL(cudaMalloc( (void**) &sourceDevc, sourceHost.size()*sizeof(gpureal) ));
+    if( sourceDevcSize != 0 ) cudaFree(sourceDevc);
+    cudaMalloc( (void**) &sourceDevc, sourceHost.size()*sizeof(gpureal) );
     sourceDevcSize = sourceHost.size();
   }
   if( targetHost.size() > targetDevcSize ) {
-    if( targetDevcSize != 0 ) CUDA_SAFE_CALL(cudaFree(targetDevc));
-    CUDA_SAFE_CALL(cudaMalloc( (void**) &targetDevc, targetHost.size()*sizeof(gpureal) ));
+    if( targetDevcSize != 0 ) cudaFree(targetDevc);
+    cudaMalloc( (void**) &targetDevc, targetHost.size()*sizeof(gpureal) );
     targetDevcSize = targetHost.size();
   }
   cudaThreadSynchronize();
 void Kernel<Laplace>::hostToDevice() {
   cudaThreadSynchronize();
   startTimer("cudaMemcpy");
-  CUDA_SAFE_CALL(cudaMemcpy(keysDevc,  &keysHost[0],  keysHost.size()*sizeof(int),cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(cudaMemcpy(rangeDevc, &rangeHost[0], rangeHost.size()*sizeof(int),cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(cudaMemcpy(sourceDevc,&sourceHost[0],sourceHost.size()*sizeof(gpureal),cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(cudaMemcpy(targetDevc,&targetHost[0],targetHost.size()*sizeof(gpureal),cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(cudaMemcpyToSymbol(constDevc,&constHost[0],constHost.size()*sizeof(gpureal)));
+  cudaMemcpy(keysDevc,  &keysHost[0],  keysHost.size()*sizeof(int),cudaMemcpyHostToDevice);
+  cudaMemcpy(rangeDevc, &rangeHost[0], rangeHost.size()*sizeof(int),cudaMemcpyHostToDevice);
+  cudaMemcpy(sourceDevc,&sourceHost[0],sourceHost.size()*sizeof(gpureal),cudaMemcpyHostToDevice);
+  cudaMemcpy(targetDevc,&targetHost[0],targetHost.size()*sizeof(gpureal),cudaMemcpyHostToDevice);
+  cudaMemcpyToSymbol(constDevc,&constHost[0],constHost.size()*sizeof(gpureal));
   cudaThreadSynchronize();
   stopTimer("cudaMemcpy");
 }
 void Kernel<Laplace>::deviceToHost() {
   cudaThreadSynchronize();
   startTimer("cudaMemcpy");
-  CUDA_SAFE_CALL(cudaMemcpy(&targetHost[0],targetDevc,targetHost.size()*sizeof(gpureal),cudaMemcpyDeviceToHost));
+  cudaMemcpy(&targetHost[0],targetDevc,targetHost.size()*sizeof(gpureal),cudaMemcpyDeviceToHost);
   cudaThreadSynchronize();
   stopTimer("cudaMemcpy");
 }
   if( numBlocks != 0 ) {
     LaplaceP2M_GPU<<< numBlocks, THREADS >>>(keysDevc,rangeDevc,targetDevc,sourceDevc);
   }
-  CUT_CHECK_ERROR("Kernel execution failed");
   cudaThreadSynchronize();
   stopTimer("P2M GPUkernel");
 }
   if( numBlocks != 0 ) {
     LaplaceM2M_GPU<<< numBlocks, THREADS >>>(keysDevc,rangeDevc,targetDevc,sourceDevc);
   }
-  CUT_CHECK_ERROR("Kernel execution failed");
   cudaThreadSynchronize();
   stopTimer("M2M GPUkernel");
 }
   if( numBlocks != 0 ) {
     LaplaceM2L_GPU<<< numBlocks, THREADS >>>(keysDevc,rangeDevc,targetDevc,sourceDevc);
   }
-  CUT_CHECK_ERROR("Kernel execution failed");
   cudaThreadSynchronize();
   stopTimer("M2L GPUkernel");
 }
   if( numBlocks != 0 ) {
     LaplaceM2P_GPU<<< numBlocks, THREADS >>>(keysDevc,rangeDevc,targetDevc,sourceDevc);
   }
-  CUT_CHECK_ERROR("Kernel execution failed");
   cudaThreadSynchronize();
   stopTimer("M2P GPUkernel");
 }
   if( numBlocks != 0 ) {
     LaplaceP2P_GPU<<< numBlocks, THREADS >>>(keysDevc,rangeDevc,targetDevc,sourceDevc);
   }
-  CUT_CHECK_ERROR("Kernel execution failed");
   cudaThreadSynchronize();
   stopTimer("P2P GPUkernel");
 }
   if( numBlocks != 0 ) {
     LaplaceL2L_GPU<<< numBlocks, THREADS >>>(keysDevc,rangeDevc,targetDevc,sourceDevc);
   }
-  CUT_CHECK_ERROR("Kernel execution failed");
   cudaThreadSynchronize();
   stopTimer("L2L GPUkernel");
 }
   if( numBlocks != 0 ) {
     LaplaceL2P_GPU<<< numBlocks, THREADS >>>(keysDevc,rangeDevc,targetDevc,sourceDevc);
   }
-  CUT_CHECK_ERROR("Kernel execution failed");
   cudaThreadSynchronize();
   stopTimer("L2P GPUkernel");
 }

kernel/GPUVanDerWaals.cu

 #define KERNEL
 #include "kernel.h"
 #undef KERNEL
-#include <cutil.h>
 __device__ __constant__ gpureal constDevc[514];                 // Constants on device
 
 template<>
   cudaThreadSynchronize();
   startTimer("cudaMalloc");
   if( keysHost.size() > keysDevcSize ) {
-    if( keysDevcSize != 0 ) CUDA_SAFE_CALL(cudaFree(keysDevc));
-    CUDA_SAFE_CALL(cudaMalloc( (void**) &keysDevc, keysHost.size()*sizeof(int) ));
+    if( keysDevcSize != 0 ) cudaFree(keysDevc);
+    cudaMalloc( (void**) &keysDevc, keysHost.size()*sizeof(int) );
     keysDevcSize = keysHost.size();
   }
   if( rangeHost.size() > rangeDevcSize ) {
-    if( rangeDevcSize != 0 ) CUDA_SAFE_CALL(cudaFree(rangeDevc));
-    CUDA_SAFE_CALL(cudaMalloc( (void**) &rangeDevc, rangeHost.size()*sizeof(int) ));
+    if( rangeDevcSize != 0 ) cudaFree(rangeDevc);
+    cudaMalloc( (void**) &rangeDevc, rangeHost.size()*sizeof(int) );
     rangeDevcSize = rangeHost.size();
   }
   if( sourceHost.size() > sourceDevcSize ) {
-    if( sourceDevcSize != 0 ) CUDA_SAFE_CALL(cudaFree(sourceDevc));
-    CUDA_SAFE_CALL(cudaMalloc( (void**) &sourceDevc, sourceHost.size()*sizeof(gpureal) ));
+    if( sourceDevcSize != 0 ) cudaFree(sourceDevc);
+    cudaMalloc( (void**) &sourceDevc, sourceHost.size()*sizeof(gpureal) );
     sourceDevcSize = sourceHost.size();
   }
   if( targetHost.size() > targetDevcSize ) {
-    if( targetDevcSize != 0 ) CUDA_SAFE_CALL(cudaFree(targetDevc));
-    CUDA_SAFE_CALL(cudaMalloc( (void**) &targetDevc, targetHost.size()*sizeof(gpureal) ));
+    if( targetDevcSize != 0 ) cudaFree(targetDevc);
+    cudaMalloc( (void**) &targetDevc, targetHost.size()*sizeof(gpureal) );
     targetDevcSize = targetHost.size();
   }
   cudaThreadSynchronize();
     constHost.push_back(GSCALE[i]);
   }
   assert( constHost.size() == 514 );
-  CUDA_SAFE_CALL(cudaMemcpy(keysDevc,  &keysHost[0],  keysHost.size()*sizeof(int),cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(cudaMemcpy(rangeDevc, &rangeHost[0], rangeHost.size()*sizeof(int),cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(cudaMemcpy(sourceDevc,&sourceHost[0],sourceHost.size()*sizeof(gpureal),cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(cudaMemcpy(targetDevc,&targetHost[0],targetHost.size()*sizeof(gpureal),cudaMemcpyHostToDevice));
-  CUDA_SAFE_CALL(cudaMemcpyToSymbol(constDevc,&constHost[0],constHost.size()*sizeof(gpureal)));
+  cudaMemcpy(keysDevc,  &keysHost[0],  keysHost.size()*sizeof(int),cudaMemcpyHostToDevice);
+  cudaMemcpy(rangeDevc, &rangeHost[0], rangeHost.size()*sizeof(int),cudaMemcpyHostToDevice);
+  cudaMemcpy(sourceDevc,&sourceHost[0],sourceHost.size()*sizeof(gpureal),cudaMemcpyHostToDevice);
+  cudaMemcpy(targetDevc,&targetHost[0],targetHost.size()*sizeof(gpureal),cudaMemcpyHostToDevice);
+  cudaMemcpyToSymbol(constDevc,&constHost[0],constHost.size()*sizeof(gpureal));
   cudaThreadSynchronize();
   stopTimer("cudaMemcpy");
 }
 void Kernel<VanDerWaals>::deviceToHost() {
   cudaThreadSynchronize();
   startTimer("cudaMemcpy");
-  CUDA_SAFE_CALL(cudaMemcpy(&targetHost[0],targetDevc,targetHost.size()*sizeof(gpureal),cudaMemcpyDeviceToHost));
+  cudaMemcpy(&targetHost[0],targetDevc,targetHost.size()*sizeof(gpureal),cudaMemcpyDeviceToHost);
   cudaThreadSynchronize();
   stopTimer("cudaMemcpy");
 }
   if( numBlocks != 0 ) {
     VanDerWaalsP2P_GPU<<< numBlocks, THREADS >>>(keysDevc,rangeDevc,targetDevc,sourceDevc);
   }
-  CUT_CHECK_ERROR("Kernel execution failed");
   cudaThreadSynchronize();
   stopTimer("P2P GPUkernel");
 }
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.