Commits

Rio Yokota committed f5d1cb3

Reverting back to old exafmm. Forked new version to exafmm-dev.

  • Parent commits d649325

Files changed (158)

 CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 
 # All options are set here
-SET(EQUATION Laplace) # {Laplace, Yukawa, Helmholtz, Stokes}
-SET(BASIS Cartesian) # {Cartesian, Spherical, Rotation, Planewave}
+OPTION(USE_SPHERICAL "Use Spherical Harmonics Expansions" ON)
 OPTION(USE_MPI "Use MPI" ON)
+OPTION(USE_OPENMP "Use OpenMP" ON)
 OPTION(USE_GPU "Use GPUs" OFF)
 OPTION(USE_VTK "Use VTK" OFF)
 
+# FMM expansion coordinate system
+IF(USE_SPHERICAL)
+  SET(EXPAND Spherical)
+ELSE()
+  SET(EXPAND Cartesian)
+ENDIF()
+
 # MPI
 IF(USE_MPI)
   FIND_PACKAGE(MPI REQUIRED)
   SET(CMAKE_CXX_COMPILER ${MPI_COMPILER})
 ENDIF()
 
+# OpenMP
+IF(USE_OPENMP)
+  FIND_PACKAGE(OpenMP REQUIRED)
+  MESSAGE(STATUS "Enabling OpenMP")
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ENDIF()
+
 # CUDA
 IF(USE_GPU)
   SET(DEVICE GPU)
 
 SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wshadow -Wuninitialized -O3")
 INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/include)
-ADD_DEFINITIONS(-D${EQUATION})
-ADD_DEFINITIONS(-D${BASIS})
 ADD_DEFINITIONS(-D${DEVICE})
-ADD_SUBDIRECTORY(kernels)
+ADD_DEFINITIONS(-D${EXPAND})
+ADD_SUBDIRECTORY(kernel)
 
 # Unit tests
 INCLUDE(CTest)
-ADD_SUBDIRECTORY(examples)
+ADD_SUBDIRECTORY(unit_test)
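
The ADD_DEFINITIONS(-D${DEVICE}) and ADD_DEFINITIONS(-D${EXPAND}) lines above expose the selected device and expansion to the sources as preprocessor symbols (e.g. -DCPU -DSpherical). A minimal sketch of how such a symbol is typically consumed in C++; both struct names are assumptions for illustration, not identifiers from this repository:

    // Hypothetical illustration of consuming the -D${EXPAND} definition.
    struct CartesianExpansion { /* Taylor-series multipole data */ };
    struct SphericalExpansion { /* spherical-harmonics multipole data */ };

    #ifdef Spherical
    typedef SphericalExpansion Expansion;  // assumed name
    #else
    typedef CartesianExpansion Expansion;  // assumed name
    #endif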

Makefile.include

 .SUFFIXES: .cxx .cu .o
 .PHONY: docs
-## ExaFMM path
-EXAFMM_INCLUDE_PATH = ../include
-EXAFMM_LIBRARY_PATH = ../lib
 
-### choose kernel
-EQUATION = Laplace
-#EQUATION = Yukawa
-#EQUATION = Helmholtz
-#EQUATION = Stokes
-
-### choose basis of multipole/local expansion
-BASIS	= Cartesian
-#BASIS	= Spherical
-#BASIS	= Planewave
-
-### choose device to use
-DEVICE	= CPU
-#DEVICE	= GPU
-#DEVICE	= MIC
-
-### GCC compiler
-CXX	= mpicxx -ggdb3 -Wall -Wextra -Wshadow -Wuninitialized -O3 -msse4.2 -ffast-math -funroll-loops -fforce-addr -fbounds-check
-### Intel compiler
-#CXX	= mpicxx -Wall -xHOST -O3 -funroll-loops -finline-functions -ansi-alias
-### BG/P compiler
-#CXX	= mpixlcxx_r -qarch=450 -qtune=450 -O3
-
-### Base flags
-CXX	+= -I$(EXAFMM_INCLUDE_PATH)
-LFLAGS	= -L$(EXAFMM_LIBRARY_PATH)
-LFLAGS	+= -DCOMcenter # Use center of mass as center of expansion
-LFLAGS	+= -DCOMkernel # Use center of mass kernel that skips dipoles (Use with -DCOMcenter)
-LFLAGS	+= -DERROR_OPT # Use error optimized theta
-LFLAGS	+= -DUSE_BMAX # Use Bmax in multipole acceptance criteria
-LFLAGS	+= -DUSE_RMAX # Use Rmax in multipole acceptance criteria
-LFLAGS	+= -DDUAL # Use dual tree traversal (turn off every option above before turning this off)
-#LFLAGS	+= -DAUTO # Use autotuned selection between P2P or M2L (needs improvement)
-
-### Debugging flags
-LFLAGS += -DASSERT # Turns on assertions (otherwise defined as an empty macro)
-#LFLAGS	+= -DCOUNT # Count number of calls to P2P and M2L
-
-### Intel TBB flags (doesn't work with OpenMP) : TBB is available from http://threadingbuildingblocks.org/download.php
-LFLAGS	+= -std=c++0x -DTBB -ltbb
-
-### MassiveThreads flags (doesn't work with OpenMP) : MassiveThreads is available from http://code.google.com/p/massivethreads/
-#LFLAGS	+= -std=c++0x -DMTHREAD -lmyth
-
-### PAPI flags
-#LFLAGS	+= -DPAPI -lpapi
-
-### VTK flags : VTK is available from http://www.vtk.org/VTK/resources/software.html
-### VTK path
-#VTK_INCLUDE_PATH = /usr/include/vtk-5.8
-#VTK_LIBRARY_PATH = /usr/lib/vtk-5.8
-#CXX	+= -I$(VTK_INCLUDE_PATH)
-#LFLAGS	= -L$(VTK_LIBRARY_PATH) -DVTK -lvtkRendering -lvtkGraphics -lvtkFiltering -lvtkViews -lvtkCommon -lvtkWidgets -lvtkIO
-
-ifeq ($(DEVICE),GPU)
 ### CUDA path
 CUDA_INSTALL_PATH = /usr/local/cuda
 CUDA_SDK_PATH = /usr/local/cuda_sdk/C
+
+### VTK path
+VTK_INCLUDE_PATH = /usr/include/vtk-5.8
+VTK_LIBRARY_PATH = /usr/lib/vtk-5.8
+
+### choose CPU or GPU
+DEVICE  = CPU
+#DEVICE  = GPU
+
+### choose Cartesian or spherical expansion
+#EXPAND  = Cartesian
+EXPAND  = Spherical
+
+### GCC compiler
+CXX	= mpicxx -ggdb3 -Wall -Wextra -Wshadow -Wuninitialized -O3 -ffast-math -fopenmp -funroll-loops -fforce-addr -fPIC -I../include
+
+### Intel compiler
+#CXX	= mpicxx -Wall -xHOST -O3 -openmp -funroll-loops -finline-functions -fPIC -ansi-alias -I../include
+
+### BG/P compiler
+#CXX	= mpixlcxx -qarch=450 -qtune=450 -I../include
+
+### K computer
+#CXX	= mpiFCCpx -Kopenmp,fast,ocl,preex,array_private -Ksimd=2,NOFLTLD,mfunc=3,optmsg=2 -Nsrc,sta -I../include -DNDEBUG
+# -Kdalign,ns,mfunc,eval,prefetch_conditional,ilfunc -x-
+
 ### CUDA compiler
-NVCC	= nvcc --compiler-bindir=/usr/bin/g++-4.4 -Xcompiler -fopenmp --ptxas-options=-v\
-	-O3 -use_fast_math -arch=sm_21\
-	-I$(EXAFMM_INCLUDE_PATH) -I$(CUDA_INSTALL_PATH)/include -I$(CUDA_SDK_PATH)/common/inc
+NVCC    = nvcc --compiler-bindir=/usr/bin/g++-4.4 -Xcompiler -fopenmp --ptxas-options=-v -O3\
+	 -use_fast_math -arch=sm_21 -I../include -I$(CUDA_INSTALL_PATH)/include -I$(CUDA_SDK_PATH)/common/inc
+
+### Base flags
+LFLAGS  = -D$(DEVICE) -D$(EXPAND)
+
+### PAPI flags
+#LFLAGS += -DPAPI -lpapi
+
+### QUARK flags
+#LFLAGS	+= -DQUARK -lquark
+
+### MassiveThreads flags
+#LFLAGS += -std=c++0x -DMTHREADS -lmyth -lpthread -ldl
+
+### VTK flags
+#CXX     += -I$(VTK_INCLUDE_PATH)
+#VFLAGS  = -L$(VTK_LIBRARY_PATH) -lvtkRendering -lvtkGraphics -lvtkFiltering -lvtkViews -lvtkCommon -lvtkWidgets -lvtkIO -DVTK
+
+ifeq ($(DEVICE),GPU)
 ### CUDA flags
-LFLAGS  += -L$(CUDA_INSTALL_PATH)/lib64 -L$(CUDA_SDK_PATH)/lib -lcuda -lcudart -lcutil_x86_64 -lstdc++ -ldl -lm
+LFLAGS  += -DQUEUE -L$(CUDA_INSTALL_PATH)/lib64 -L$(CUDA_SDK_PATH)/lib -lcuda -lcudart -lcutil_x86_64 -lstdc++ -ldl -lm
 endif
 
-KERNELS	= ../kernels/$(EQUATION)$(BASIS)$(DEVICE).o
+OBJECT = ../kernel/$(DEVICE)$(EXPAND)Laplace.o ../kernel/$(DEVICE)VanDerWaals.o\
+	../kernel/CPUP2P.o
 
 .cxx.o  :
 	$(CXX) -c $? -o $@ $(LFLAGS)

examples/CMakeLists.txt

-ADD_EXECUTABLE(serial serial.cxx)
-TARGET_LINK_LIBRARIES(serial Kernels)
-ADD_TEST(serial ${CMAKE_CURRENT_BINARY_DIR}/serial)
-
-IF(USE_MPI)
-  ADD_EXECUTABLE(parallel parallel.cxx)
-  TARGET_LINK_LIBRARIES(parallel Kernels)
-  ADD_TEST(parallel ${MPIEXEC} -np 2 ${CMAKE_CURRENT_BINARY_DIR}/parallel)
-ENDIF()

examples/Makefile

-include ../Makefile.include
-
-#LFLAGS	+= -DMANY
-
-serial	: serial.cxx $(KERNELS)
-	$(CXX) $? $(LFLAGS)
-	./a.out
-
-parallel: parallel.cxx $(KERNELS)
-	$(CXX) $? $(LFLAGS)
-	mpirun -np 32 ./a.out
-
-wrapper: wrapper.cxx $(KERNELS)
-	make -C ../wrappers libcoulomb.a
-	$(CXX) $? $(LFLAGS) -lcoulomb
-	mpirun -np 4 ./a.out

examples/parallel.cxx

-#include "args.h"
-#include "dataset.h"
-#include "parallelfmm.h"
-#ifdef VTK
-#include "vtk.h"
-#endif
-
-int main(int argc, char ** argv) {
-  Args ARGS(argc, argv);
-  Bodies bodies, jbodies;
-  Cells cells, jcells;
-  Dataset DATA;
-  ParallelFMM FMM;
-  FMM.NCRIT = ARGS.NCRIT;
-  FMM.NSPAWN = ARGS.NSPAWN;
-  FMM.IMAGES = ARGS.IMAGES;
-  FMM.THETA = ARGS.THETA;
-  FMM.printNow = FMM.MPIRANK == 0;
-#if AUTO
-  FMM.timeKernels();
-#endif
-#ifdef MANY
-  for ( int it=0; it<25; it++ ) {
-    int numBodies = int(pow(10,(it+24)/8.0));
-#else
-  {
-    int numBodies = ARGS.numBodies / FMM.MPISIZE;
-#endif
-    if(FMM.printNow) std::cout << std::endl
-      << "N                    : " << numBodies << std::endl;
-    bodies.resize(numBodies);
-    DATA.initBodies(bodies, ARGS.distribution, FMM.MPIRANK, FMM.MPISIZE);
-    FMM.startTimer("FMM");
-
-    FMM.partition(bodies);
-    FMM.setBounds(bodies);
-    FMM.buildTree(bodies,cells);
-    FMM.upwardPass(cells);
-    FMM.startPAPI();
-
-#if 1 // Set to 0 for debugging by shifting bodies and reconstructing tree : Step 1
-    FMM.setLET(cells);
-    FMM.commBodies();
-    FMM.commCells();
-    FMM.evaluate(cells, cells);
-    jbodies = bodies;
-    for( int irank=1; irank<FMM.MPISIZE; irank++ ) {
-      FMM.getLET(jcells,(FMM.MPIRANK+irank)%FMM.MPISIZE);
-
-#if 0 // Set to 1 for debugging full LET communication : Step 2 (LET must be set to full tree)
-      FMM.shiftBodies(jbodies); // This will overwrite recvBodies. (define recvBodies2 in partition.h to avoid this)
-      Cells icells;
-      FMM.buildTree(jbodies, icells);
-      FMM.upwardPass(icells);
-      assert( icells.size() == jcells.size() );
-      CellQueue Qi, Qj;
-      Qi.push(icells.begin());
-      Qj.push(jcells.begin());
-      int ic=0;
-      while( !Qi.empty() ) {
-        C_iter Ci=Qi.front(); Qi.pop();
-        C_iter Cj=Qj.front(); Qj.pop();
-        if( Ci->ICELL != Cj->ICELL ) {
-          std::cout << FMM.MPIRANK << " ICELL  : " << Ci->ICELL << " " << Cj->ICELL << std::endl;
-          break;
-        }
-        if( Ci->NCHILD != Cj->NCHILD ) {
-          std::cout << FMM.MPIRANK << " NCHILD : " << Ci->NCHILD << " " << Cj->NCHILD << std::endl;
-          break;
-        }
-        if( Ci->NCBODY != Cj->NCBODY ) {
-          std::cout << FMM.MPIRANK << " NCBODY : " << Ci->NCBODY << " " << Cj->NCBODY << std::endl;
-          break;
-        }
-        real_t sumi = 0, sumj = 0;
-        if( Ci->NCBODY != 0 ) {
-          for( int i=0; i<Ci->NCBODY; i++ ) {
-            B_iter Bi = Ci->BODY+i;
-            B_iter Bj = Cj->BODY+i;
-            sumi += Bi->X[0];
-            sumj += Bj->X[0];
-          }
-        }
-        if( fabs(sumi-sumj)/fabs(sumi) > 1e-6 ) std::cout << FMM.MPIRANK << " " << Ci->ICELL << " " << sumi << " " << sumj << std::endl;
-        assert( fabs(sumi-sumj)/fabs(sumi) < 1e-6 );
-        for( int i=0; i<Ci->NCHILD; i++ ) Qi.push(icells.begin()+Ci->CHILD+i);
-        for( int i=0; i<Cj->NCHILD; i++ ) Qj.push(jcells.begin()+Cj->CHILD+i);
-        ic++;
-      }
-      assert( ic == int(icells.size()) );
-#endif
-      FMM.evaluate(cells, jcells);
-    }
-#else
-    jbodies = bodies;
-    for( int irank=0; irank!=FMM.MPISIZE; irank++ ) {
-      FMM.shiftBodies(jbodies);
-      jcells.clear();
-      FMM.setBounds(jbodies);
-      FMM.buildTree(jbodies, jcells);
-      FMM.upwardPass(jcells);
-      FMM.evaluate(cells, jcells);
-    }
-#endif
-
-    FMM.downwardPass(cells);
-
-    FMM.stopPAPI();
-    FMM.stopTimer("FMM",FMM.printNow);
-    FMM.eraseTimer("FMM");
-    FMM.writeTime();
-    FMM.resetTimer();
-
-    jbodies = bodies;
-    if (bodies.size() > 100) bodies.resize(100);
-    Bodies bodies2 = bodies;
-    DATA.initTarget(bodies2);
-    FMM.startTimer("Direct sum");
-    for( int i=0; i!=FMM.MPISIZE; ++i ) {
-      FMM.shiftBodies(jbodies);
-      FMM.direct(bodies2, jbodies);
-      if(FMM.printNow) std::cout << "Direct loop          : " << i+1 << "/" << FMM.MPISIZE << std::endl;
-    }
-    FMM.normalize(bodies2);
-    FMM.stopTimer("Direct sum",FMM.printNow);
-    FMM.eraseTimer("Direct sum");
-    double diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0, diff3 = 0, norm3 = 0, diff4 = 0, norm4 = 0;
-    DATA.evalError(bodies, bodies2, diff1, norm1, diff2, norm2);
-    MPI_Reduce(&diff1, &diff3, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-    MPI_Reduce(&norm1, &norm3, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-    MPI_Reduce(&diff2, &diff4, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-    MPI_Reduce(&norm2, &norm4, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-    if(FMM.printNow) DATA.printError(diff3, norm3, diff4, norm4);
-  }
-
-#ifdef VTK
-  for( B_iter B=jbodies.begin(); B!=jbodies.end(); ++B ) B->ICELL = 0;
-  for( C_iter C=jcells.begin(); C!=jcells.end(); ++C ) {
-    Body body;
-    body.ICELL = 1;
-    body.X     = C->X;
-    body.SRC   = 0;
-    jbodies.push_back(body);
-  }
-  int Ncell = 0;
-  vtkPlot vtk;
-  if( FMM.MPIRANK == 0 ) {
-    vtk.setDomain(M_PI,0);
-    vtk.setGroupOfPoints(jbodies,Ncell);
-  }
-  for( int i=1; i!=FMM.MPISIZE; ++i ) {
-    FMM.shiftBodies(jbodies);
-    if( FMM.MPIRANK == 0 ) {
-      vtk.setGroupOfPoints(jbodies,Ncell);
-    }
-  }
-  if( FMM.MPIRANK == 0 ) {
-    vtk.plot(Ncell);
-  }
-#endif
-}

examples/serial.cxx

-#include "args.h"
-#include "dataset.h"
-#include "serialfmm.h"
-#ifdef VTK
-#include "vtk.h"
-#endif
-
-int main(int argc, char ** argv) {
-  Args ARGS(argc, argv);
-  Bodies bodies, jbodies;
-  Cells cells, jcells;
-  Dataset DATA;
-  SerialFMM FMM;
-  FMM.NCRIT = ARGS.NCRIT;
-  FMM.NSPAWN = ARGS.NSPAWN;
-  FMM.IMAGES = ARGS.IMAGES;
-  FMM.THETA = ARGS.THETA;
-  FMM.printNow = true;
-#if AUTO
-  FMM.timeKernels();
-#endif
-#ifdef MANY
-  for( int it=0; it<25; it++ ) {
-    int numBodies = int(pow(10,(it+24)/8.0));
-#else
-  {
-    int numBodies = ARGS.numBodies;
-#endif // MANY
-    if(FMM.printNow) std::cout << std::endl
-      << "N                    : " << numBodies << std::endl;
-    bodies.resize(numBodies);
-    DATA.initBodies(bodies, ARGS.distribution);
-    FMM.setBounds(bodies);
-    FMM.buildTree(bodies, cells);                               // TODO : make it work without this
-    FMM.resetTimer();
-    FMM.startTimer("FMM");
-    FMM.buildTree(bodies, cells);
-    FMM.upwardPass(cells);
-    FMM.startPAPI();
-    FMM.evaluate(cells, cells, ARGS.mutual);
-    FMM.stopPAPI();
-    FMM.downwardPass(cells);
-    FMM.stopTimer("FMM", FMM.printNow);
-    FMM.eraseTimer("FMM");
-    FMM.writeTime();
-    FMM.resetTimer();
-    jbodies = bodies;
-    if (int(bodies.size()) > ARGS.numTarget) bodies.resize(ARGS.numTarget);
-    Bodies bodies2 = bodies;
-    DATA.initTarget(bodies2);
-    FMM.startTimer("Direct sum");
-    FMM.direct(bodies2, jbodies);
-    FMM.normalize(bodies2);
-    FMM.stopTimer("Direct sum", FMM.printNow);
-    FMM.eraseTimer("Direct sum");
-    double diff1 = 0, norm1 = 0, diff2 = 0, norm2 = 0;
-    DATA.evalError(bodies, bodies2, diff1, norm1, diff2, norm2);
-    if(FMM.printNow) DATA.printError(diff1, norm1, diff2, norm2);
-  }
-#ifdef VTK
-  for( B_iter B=jbodies.begin(); B!=jbodies.end(); ++B ) B->ICELL = 0;
-  for( C_iter C=jcells.begin(); C!=jcells.end(); ++C ) {
-    Body body;
-    body.ICELL = 1;
-    body.X     = C->X;
-    body.SRC   = 0;
-    jbodies.push_back(body);
-  }
-  int Ncell = 0;
-  vtkPlot vtk;
-  vtk.setDomain(M_PI,0);
-  vtk.setGroupOfPoints(jbodies, Ncell);
-  vtk.plot(Ncell);
-#endif
-}
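
For reference, the accuracy check at the end of this driver (evalError/printError) reports relative L2 errors of potential and force against the direct sum. Assuming evalError accumulates squared differences and squared norms in (diff1, norm1) and (diff2, norm2), consistent with the explicit computation in examples/wrapper.cxx below, the reported quantities are

    \epsilon_p = \sqrt{ \sum_i (p_i^{fmm} - p_i^{dir})^2 \,/\, \sum_i (p_i^{dir})^2 },
    \qquad
    \epsilon_f = \sqrt{ \sum_i \lVert f_i^{fmm} - f_i^{dir} \rVert^2 \,/\, \sum_i \lVert f_i^{dir} \rVert^2 }.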

examples/wrapper.cxx

-#include <mpi.h>
-#include <cmath>
-#include <cstdlib>
-#include <iostream>
-
-extern "C" void FMMcalccoulomb(int n, double* x, double* q, double *p, double* f, int periodicflag);
-
-extern "C" void MPI_Shift(double *var, int n, int mpisize, int mpirank) {
-  double *buf = new double [n];
-  const int isend = (mpirank + 1          ) % mpisize;
-  const int irecv = (mpirank - 1 + mpisize) % mpisize;
-  MPI_Request sreq, rreq;
-
-  MPI_Isend(var, n, MPI_DOUBLE, irecv, 1, MPI_COMM_WORLD, &sreq);
-  MPI_Irecv(buf, n, MPI_DOUBLE, isend, 1, MPI_COMM_WORLD, &rreq);
-  MPI_Wait(&sreq, MPI_STATUS_IGNORE);
-  MPI_Wait(&rreq, MPI_STATUS_IGNORE);
-  int i;
-  for( i=0; i!=n; ++i ) {
-    var[i] = buf[i];
-  }
-  delete[] buf;
-}
-
-int main(int argc, char **argv) {
-  MPI_Init(&argc,&argv);
-  int mpisize, mpirank;
-  MPI_Comm_size(MPI_COMM_WORLD, &mpisize);
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpirank);
-  const int N = 1000000;
-  const double size = 2 * M_PI;
-  double *xi = new double [3*N];
-  double *qi = new double [N];
-  double *pi = new double [N];
-  double *fi = new double [3*N];
-  double *pd = new double [N];
-  double *fd = new double [3*N];
-  double *xj = new double [3*N];
-  double *qj = new double [N];
-
-  srand48(mpirank);
-  for( int i=0; i!=N; ++i ) {
-    xi[3*i+0] = drand48() * size - M_PI;
-    xi[3*i+1] = drand48() * size - M_PI;
-    xi[3*i+2] = drand48() * size - M_PI;
-    qi[i] = 1. / N;
-    pi[i] = 0;
-    fi[3*i+0] = fi[3*i+1] = fi[3*i+2] = 0;
-    pd[i] = 0;
-    fd[3*i+0] = fd[3*i+1] = fd[3*i+2] = 0;
-    xj[3*i+0] = xi[3*i+0];
-    xj[3*i+1] = xi[3*i+1];
-    xj[3*i+2] = xi[3*i+2];
-    qj[i] = qi[i];
-  }
-
-  FMMcalccoulomb(N, xi, qi, pi, fi, 0);
-  for( int i=0; i!=N; ++i ) {
-    xj[3*i+0] = xi[3*i+0];
-    xj[3*i+1] = xi[3*i+1];
-    xj[3*i+2] = xi[3*i+2];
-  }
-  for( int irank=0; irank!=mpisize; ++irank ) {
-    MPI_Shift(xj, 3*N, mpisize, mpirank);
-    MPI_Shift(qj, N, mpisize, mpirank);
-    for( int i=0; i!=100; ++i ) {
-      double P = 0, Fx = 0, Fy = 0, Fz = 0;
-      for( int j=0; j!=N; ++j ) {
-        double dx = xi[3*i+0] - xj[3*j+0];
-        double dy = xi[3*i+1] - xj[3*j+1];
-        double dz = xi[3*i+2] - xj[3*j+2];
-        double R2 = dx * dx + dy * dy + dz * dz;
-        double invR = 1 / std::sqrt(R2);
-        if( irank == mpisize-1 && i == j ) invR = 0;
-        double invR3 = qj[j] * invR * invR * invR;
-        P += qj[j] * invR;
-        Fx += dx * invR3;
-        Fy += dy * invR3;
-        Fz += dz * invR3;
-      }
-      pd[i] += P;
-      fd[3*i+0] += Fx;
-      fd[3*i+1] += Fy;
-      fd[3*i+2] += Fz;
-    }
-  }
-  double Pd = 0, Pn = 0, Fd = 0, Fn = 0;
-  for( int i=0; i!=100; ++i ) {
-    Pd += (pi[i] - pd[i]) * (pi[i] - pd[i]);
-    Pn += pd[i] * pd[i];
-    Fd += (fi[3*i+0] - fd[3*i+0]) * (fi[3*i+0] - fd[3*i+0])
-        + (fi[3*i+1] - fd[3*i+1]) * (fi[3*i+1] - fd[3*i+1])
-        + (fi[3*i+2] - fd[3*i+2]) * (fi[3*i+2] - fd[3*i+2]);
-    Fn += fd[3*i+0] * fd[3*i+0] + fd[3*i+1] * fd[3*i+1] + fd[3*i+2] * fd[3*i+2];
-  }
-  std::cout << "Coulomb       potential @ rank " << mpirank << " : " << sqrtf(Pd/Pn) << std::endl;
-  std::cout << "Coulomb       force     @ rank " << mpirank << " : " << sqrtf(Fd/Fn) << std::endl;
-
-  delete[] xi;
-  delete[] qi;
-  delete[] pi;
-  delete[] fi;
-  delete[] pd;
-  delete[] fd;
-  delete[] xj;
-  delete[] qj;
-
-  MPI_Finalize();
-}
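
The MPI_Shift helper above rotates an array one position around the ring of ranks (each rank sends to rank-1 and receives from rank+1), so after mpisize shifts every rank has seen every other rank's bodies. A minimal sketch of the same rotation, assuming only standard MPI; MPI_Sendrecv_replace trades the explicit staging buffer for an MPI-internal copy:

    #include <mpi.h>

    // Rotate var (n doubles) one rank around the ring: after the call,
    // rank r holds the data previously held by rank (r+1) % mpisize,
    // matching the direction of MPI_Shift above.
    void ringShift(double *var, int n, int mpisize, int mpirank) {
      const int dest   = (mpirank - 1 + mpisize) % mpisize;  // send downstream
      const int source = (mpirank + 1) % mpisize;            // receive from upstream
      MPI_Sendrecv_replace(var, n, MPI_DOUBLE, dest, 0, source, 0,
                           MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }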

gpu/Makefile

-CUDA_INSTALL_PATH  = /usr/local/cuda
-
-CXX = g++ -fopenmp -O3
-NVCC = nvcc --compiler-bindir=/usr/bin/g++-4.4 -arch sm_21 -use_fast_math -Iinclude -Xcompiler "-fopenmp -O3"
-
-ORG = approximate.o \
-	main.o \
-	scanKernels.o \
-	tree.o
-TORG = sortKernels.th_o
-
-.SUFFIXES: .o .cpp .cu
-
-all: $(ORG) $(TORG)
-	$(CXX) $^ -lcudart -L$(CUDA_INSTALL_PATH)/lib64
-	./a.out
-
-%.o: %.cu
-	$(NVCC) -c $< -o $@
-
-%.th_o: %.cu
-	$(NVCC) -c $< -o $@
-
-$(ORG): include/*.h
-
-clean:
-	find . -name "*.o" -o -name "*.out*" | xargs rm -rf
-cleanall:
-	make clean
-	rm *.th_o
-commit:
-	@make -C .. commit
-save:
-	@make -C .. save
-revert:
-	@make -C .. revert

gpu/approximate.cu

-#include "octree.h"
-#define laneId (threadIdx.x & (WARP_SIZE - 1))
-#define warpId (threadIdx.x >> WARP_SIZE2)
-#define IF(x) (-(int)(x))
-#define ABS(x) ((int(x) < 0 ) ? -(x) : (x))
-
-__device__ __forceinline__ int inclusiveScanInt(int* prefix, int value) 
-{
-  prefix[laneId] = value;
-  for (int i = 0; i < WARP_SIZE2; i++) {
-    const int offset = 1 << i;
-    const int laneOffset = ABS(laneId-offset);
-    prefix[laneId] += prefix[laneOffset] & IF(laneId >= offset);
-  }
-  return prefix[WARP_SIZE-1];
-}
-
-__device__ __forceinline__ int lanemask_lt()
-{
-  int mask;
-  asm("mov.u32 %0, %lanemask_lt;" : "=r" (mask));
-  return mask;
-}
-
-__device__ int exclusiveScanBit(const bool flag)
-{
-  const uint flags = __ballot(flag);
-  return __popc(flags & lanemask_lt());
-}
-
-__device__ int reduceBit(const bool flag)
-{
-  const uint flags = __ballot(flag);
-  return __popc(flags);
-}
-
-__device__ __forceinline__ int lanemask_le()
-{
-  int mask;
-  asm("mov.u32 %0, %lanemask_le;" : "=r" (mask));
-  return mask;
-}
-
-__device__ __forceinline__ int inclusive_segscan_warp(
-    int *shmem, const int packed_value, int &dist_block, int &nseg)
-{
-  const int  flag = packed_value < 0;
-  const int  mask = IF(flag);
-  const int value = (mask & (-1-packed_value)) + (~mask & 1);
-  const int flags = __ballot(flag);
-
-  nseg += __popc(flags) ;
-  dist_block = __clz(__brev(flags));
-
-  const int distance = min(__clz(flags & lanemask_le()) + laneId - 31, laneId);
-  shmem[laneId] = value;
-  for( int i=0; i<WARP_SIZE2; i++ ) {
-    const int offset = 1 << i;
-    const int laneOffset = ABS(laneId-offset);
-    shmem[laneId] += shmem[laneOffset] & IF(offset <= distance);
-  }
-  return shmem[WARP_SIZE - 1];
-}
-
-__device__ __forceinline__ int inclusive_segscan_array(int *shmem_in, const int N)
-{
-  int dist, nseg = 0;
-  int y = inclusive_segscan_warp(shmem_in, shmem_in[laneId], dist, nseg);
-  for( int p=WARP_SIZE; p<N; p+=WARP_SIZE ) {
-    int *shmem = shmem_in + p;
-    int y1 = inclusive_segscan_warp(shmem, shmem[laneId], dist, nseg);
-    shmem[laneId] += y & IF(laneId < dist);
-    y = y1;
-  }
-  return nseg;
-}
-
-__device__ __forceinline__ int ACCESS(const int i) {
-  return (i & (LMEM_STACK_SIZE - 1)) * blockDim.x + threadIdx.x;
-}
-
-texture<uint, 1, cudaReadModeElementType> texNodeChild;
-texture<float, 1, cudaReadModeElementType> texOpening;
-texture<float4, 1, cudaReadModeElementType> texMultipole;
-texture<float4, 1, cudaReadModeElementType> texBody;
-
-__device__ __forceinline__ void P2P(
-    float4 &acc,  const float4 pos,
-    const float4 posj) {
-  const float3 dr = make_float3(posj.x - pos.x, posj.y - pos.y, posj.z - pos.z);
-  const float r2     = dr.x*dr.x + dr.y*dr.y + dr.z*dr.z + EPS2;
-  const float rinv   = rsqrtf(r2);
-  const float rinv2  = rinv*rinv;
-  const float mrinv  = posj.w * rinv;
-  const float mrinv3 = mrinv * rinv2;
-  acc.w -= mrinv;
-  acc.x += mrinv3 * dr.x;
-  acc.y += mrinv3 * dr.y;
-  acc.z += mrinv3 * dr.z;
-}
-
-__device__ bool applyMAC(
-    const float4 sourceCenter, 
-    const float4 groupCenter, 
-    const float4 groupSize) {
-  float3 dr = make_float3(fabsf(groupCenter.x - sourceCenter.x) - (groupSize.x),
-                          fabsf(groupCenter.y - sourceCenter.y) - (groupSize.y),
-                          fabsf(groupCenter.z - sourceCenter.z) - (groupSize.z));
-  dr.x += fabsf(dr.x); dr.x *= 0.5f;
-  dr.y += fabsf(dr.y); dr.y *= 0.5f;
-  dr.z += fabsf(dr.z); dr.z *= 0.5f;
-  const float ds2 = dr.x*dr.x + dr.y*dr.y + dr.z*dr.z;
-  return ds2 <= fabsf(sourceCenter.w);
-}
-
-__device__ void traverse(
-    float4 &pos_i,
-    float4 &acc_i,
-    float4 targetCenter,
-    float4 targetSize,
-    uint2 rootRange,
-    int *shmem,
-    int *lmem) {
-  const int stackSize = LMEM_STACK_SIZE << NTHREAD2;
-  int *approxNodes = lmem + stackSize + 2 * WARP_SIZE * warpId;
-  int *numDirect = shmem;
-  int *stackShrd = numDirect + WARP_SIZE;
-  int *directNodes = stackShrd + WARP_SIZE;
-  float4 *pos_j = (float4*)&directNodes[3*WARP_SIZE];
-  int *prefix = (int*)&pos_j[WARP_SIZE];
-
-  // stack
-  int *stackGlob = lmem;
-  // begin tree-walk
-  int warpOffsetApprox = 0;
-  int warpOffsetDirect = 0;
-  for( int root=rootRange.x; root<rootRange.y; root+=WARP_SIZE ) {
-    int numNodes = min(rootRange.y-root, WARP_SIZE);
-    int beginStack = 0;
-    int endStack = 1;
-    stackGlob[threadIdx.x] = root + laneId;
-    // walk each level
-    while( numNodes > 0 ) {
-      int numNodesNew = 0;
-      int warpOffsetSplit = 0;
-      int numStack = endStack;
-      // walk a level
-      for( int iStack=beginStack; iStack<endStack; iStack++ ) {
-        bool valid = laneId < numNodes;
-        int node = stackGlob[ACCESS(iStack)] & IF(valid);
-        numNodes -= WARP_SIZE;
-        float opening = tex1Dfetch(texOpening, node);
-        uint sourceData = tex1Dfetch(texNodeChild, node);
-        float4 sourceCenter = tex1Dfetch(texMultipole, node);
-        sourceCenter.w = opening;
-        bool split = applyMAC(sourceCenter, targetCenter, targetSize);
-        bool leaf = opening <= 0;
-        bool flag = split && !leaf && valid;
-        int child = sourceData & 0x0FFFFFFF;
-        int numChild = ((sourceData & 0xF0000000) >> 28) & IF(flag);
-        int sumChild = inclusiveScanInt(prefix, numChild);
-        int laneOffset = prefix[laneId];
-        laneOffset += warpOffsetSplit - numChild;
-        for( int i=0; i<numChild; i++ )
-          stackShrd[laneOffset+i] = child+i;
-        warpOffsetSplit += sumChild;
-        while( warpOffsetSplit >= WARP_SIZE ) {
-          warpOffsetSplit -= WARP_SIZE;
-          stackGlob[ACCESS(numStack)] = stackShrd[warpOffsetSplit+laneId];
-          numStack++;
-          numNodesNew += WARP_SIZE;
-          if( (numStack - iStack) > LMEM_STACK_SIZE ) return;
-        }
-#if 1   // APPROX
-        flag = !split && valid;
-        laneOffset = exclusiveScanBit(flag);
-        if( flag ) approxNodes[warpOffsetApprox+laneOffset] = node;
-        warpOffsetApprox += reduceBit(flag);
-        if( warpOffsetApprox >= WARP_SIZE ) {
-          warpOffsetApprox -= WARP_SIZE;
-          node = approxNodes[warpOffsetApprox+laneId];
-          pos_j[laneId] = tex1Dfetch(texMultipole, node);
-          for( int i=0; i<WARP_SIZE; i++ )
-            P2P(acc_i, pos_i, pos_j[i]);
-        }
-#endif
-#if 1   // DIRECT
-        flag = split && leaf && valid;
-        const int jbody = sourceData & BODYMASK;
-        int numBodies = (((sourceData & INVBMASK) >> LEAFBIT)+1) & IF(flag);
-        directNodes[laneId] = numDirect[laneId];
-
-        int sumBodies = inclusiveScanInt(prefix, numBodies);
-        laneOffset = prefix[laneId];
-        if( flag ) prefix[exclusiveScanBit(flag)] = laneId;
-        numDirect[laneId] = laneOffset;
-        laneOffset -= numBodies;
-        int numFinished = 0;
-        while( sumBodies > 0 ) {
-          numBodies = min(sumBodies, 3*WARP_SIZE-warpOffsetDirect);
-          for( int i=warpOffsetDirect; i<warpOffsetDirect+numBodies; i+=WARP_SIZE )
-            directNodes[i+laneId] = 0;
-          if( flag && (numDirect[laneId] <= numBodies) && (laneOffset >= 0) )
-            directNodes[warpOffsetDirect+laneOffset] = -1-jbody;
-          numFinished += inclusive_segscan_array(&directNodes[warpOffsetDirect], numBodies);
-          numBodies = numDirect[prefix[numFinished-1]];
-          sumBodies -= numBodies;
-          numDirect[laneId] -= numBodies;
-          laneOffset -= numBodies;
-          warpOffsetDirect += numBodies;
-          while( warpOffsetDirect >= WARP_SIZE ) {
-            warpOffsetDirect -= WARP_SIZE;
-            pos_j[laneId] = tex1Dfetch(texBody,directNodes[warpOffsetDirect+laneId]);
-            for( int i=0; i<WARP_SIZE; i++ )
-              P2P(acc_i, pos_i, pos_j[i]);
-          }
-        }
-        numDirect[laneId] = directNodes[laneId];
-#endif
-      }
-
-      if( warpOffsetSplit > 0 ) { 
-        stackGlob[ACCESS(numStack)] = stackShrd[laneId];
-        numStack++; 
-        numNodesNew += warpOffsetSplit;
-      }
-      numNodes = numNodesNew;
-      beginStack = endStack;
-      endStack = numStack;
-    }
-  }
-
-  if( warpOffsetApprox > 0 ) {
-    if( laneId < warpOffsetApprox )  {
-      const int node = approxNodes[laneId];
-      pos_j[laneId] = tex1Dfetch(texMultipole, node);
-    } else {
-      pos_j[laneId] = make_float4(1.0e10f, 1.0e10f, 1.0e10f, 0.0f);
-    }
-    for( int i=0; i<WARP_SIZE; i++ )
-      P2P(acc_i, pos_i, pos_j[i]);
-  }
-
-  if( warpOffsetDirect > 0 ) {
-    if( laneId < warpOffsetDirect ) {
-      const float4 posj = tex1Dfetch(texBody,numDirect[laneId]);
-      pos_j[laneId] = posj;
-    } else {
-      pos_j[laneId] = make_float4(1.0e10f, 1.0e10f, 1.0e10f, 0.0f);
-    }
-    for( int i=0; i<WARP_SIZE; i++ ) 
-      P2P(acc_i, pos_i, pos_j[i]);
-  }
-}
-
-extern "C" __global__ void
-  traverseKernel(
-      const int numGroups,
-      uint2 *levelRange,
-      float4 *acc,
-      float4 *groupSizeInfo,
-      float4 *groupCenterInfo,
-      int    *MEM_BUF,
-      uint   *workToDo) {
-  __shared__ int wid[4];
-  __shared__ int shmem_pool[10*NTHREAD];
-  int *shmem = shmem_pool+10*WARP_SIZE*warpId;
-  int *lmem = &MEM_BUF[blockIdx.x*(LMEM_STACK_SIZE*NTHREAD+2*NTHREAD)];
-  while(true) {
-    if( laneId == 0 )
-      wid[warpId] = atomicAdd(workToDo,1);
-    if( wid[warpId] >= numGroups ) return;
-    float4 groupSize = groupSizeInfo[wid[warpId]];
-    const int groupData = __float_as_int(groupSize.w);
-    const uint begin = groupData & CRITMASK;
-    const uint numGroup = ((groupData & INVCMASK) >> CRITBIT) + 1;
-    float4 groupCenter = groupCenterInfo[wid[warpId]];
-    uint body_i = begin + laneId % numGroup;
-    float4 pos_i = tex1Dfetch(texBody,body_i);
-    float4 acc_i = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-    traverse(pos_i, acc_i, groupCenter, groupSize, levelRange[2], shmem, lmem);
-    if( laneId < numGroup )
-      acc[body_i] = acc_i;
-  }
-}
-
-extern "C" __global__ void directKernel(float4 *bodyPos, float4 *bodyAcc, const int N) {
-  uint idx = min(blockIdx.x * blockDim.x + threadIdx.x, N-1);
-  float4 pos_i = bodyPos[idx];
-  float4 acc_i = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-  __shared__ float4 shmem[NTHREAD];
-  float4 *pos_j = shmem + WARP_SIZE * warpId;
-  const int numWarp = ALIGN(N, WARP_SIZE);
-  for( int jwarp=0; jwarp<numWarp; jwarp++ ) {
-    int jGlob = jwarp*WARP_SIZE+laneId;
-    pos_j[laneId] = bodyPos[min(jGlob,N-1)];
-    pos_j[laneId].w *= jGlob < N;
-    for( int i=0; i<WARP_SIZE; i++ )
-      P2P(acc_i, pos_i, pos_j[i]);
-  }
-  bodyAcc[idx] = acc_i;
-}
-
-void octree::traverse() {
-  nodeChild.tex("texNodeChild");
-  openingAngle.tex("texOpening");
-  multipole.tex("texMultipole");
-  bodyPos.tex("texBody");
-  workToDo.zeros();
-  traverseKernel<<<NBLOCK,NTHREAD,0,execStream>>>(
-    numGroups,
-    levelRange.devc(),
-    bodyAcc.devc(),
-    groupSizeInfo.devc(),
-    groupCenterInfo.devc(),
-    (int*)generalBuffer1.devc(),
-    workToDo.devc()
-  );
-}
-
-void octree::iterate() {
-  CU_SAFE_CALL(cudaStreamCreate(&execStream));
-  double t1 = get_time();
-  getBoundaries();
-  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
-  printf("BOUND : %lf\n",get_time() - t1);;
-  t1 = get_time();
-  getKeys();
-  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
-  printf("INDEX : %lf\n",get_time() - t1);;
-  t1 = get_time();
-  sortKeys();
-  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
-  printf("KEYS  : %lf\n",get_time() - t1);;
-  t1 = get_time();
-  sortBodies();
-  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
-  printf("BODIES: %lf\n",get_time() - t1);;
-  t1 = get_time();
-  buildTree();
-  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
-  printf("BUILD : %lf\n",get_time() - t1);;
-  t1 = get_time();
-  allocateTreePropMemory();
-  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
-  printf("ALLOC : %lf\n",get_time() - t1);;
-  t1 = get_time();
-  linkTree();
-  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
-  printf("LINK  : %lf\n",get_time() - t1);;
-  t1 = get_time();
-  upward();
-  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
-  printf("UPWARD: %lf\n",get_time() - t1);;
-  t1 = get_time();
-  traverse();
-  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
-  printf("FMM   : %lf\n",get_time() - t1);;
-}
-
-void octree::direct() {
-  int blocks = ALIGN(numBodies/100, NTHREAD);
-  directKernel<<<blocks,NTHREAD,0,execStream>>>(bodyPos.devc(),bodyAcc2.devc(),numBodies);
-  CU_SAFE_CALL(cudaStreamSynchronize(execStream));
-  CU_SAFE_CALL(cudaStreamDestroy(execStream));
-}
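
The P2P device function above accumulates a Plummer-softened monopole interaction: potential in acc.w and force in acc.{x,y,z}. A host-side reference of the same formula can be handy for spot-checking kernel output; this is a sketch assuming the same EPS2 softening, not code from this repository:

    #include <cmath>

    struct Float4 { float x, y, z, w; };  // w: source mass, or accumulated potential

    // Accumulate the interaction of source posj on target pos, mirroring
    // the device P2P above: phi -= m/r, f += m * dr / r^3, with r^2 += EPS2.
    void p2pReference(Float4 &acc, const Float4 &pos, const Float4 &posj,
                      float EPS2) {
      const float dx = posj.x - pos.x;
      const float dy = posj.y - pos.y;
      const float dz = posj.z - pos.z;
      const float r2 = dx * dx + dy * dy + dz * dz + EPS2;
      const float rinv = 1.0f / std::sqrt(r2);
      const float mrinv = posj.w * rinv;         // m / r
      const float mrinv3 = mrinv * rinv * rinv;  // m / r^3
      acc.w -= mrinv;
      acc.x += mrinv3 * dx;
      acc.y += mrinv3 * dy;
      acc.z += mrinv3 * dz;
    }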

gpu/include/b40c/radix_sort/downsweep/6bit_prmt/cta.cuh

-/******************************************************************************
- * 
- * Copyright 2010-2011 Duane Merrill
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. 
- * 
- * For more information, see our Google Code project site: 
- * http://code.google.com/p/back40computing/
- * 
- ******************************************************************************/
-
-/******************************************************************************
- * Abstract CTA-processing functionality for partitioning downsweep
- * scan kernels
- ******************************************************************************/
-
-#pragma once
-
-#include <b40c/util/basic_utils.cuh>
-#include <b40c/util/device_intrinsics.cuh>
-#include <b40c/util/io/load_tile.cuh>
-#include <b40c/util/io/scatter_tile.cuh>
-
-namespace b40c {
-namespace partition {
-namespace downsweep {
-
-
-/**
- * Partitioning downsweep scan CTA
- *
- * Abstract class
- */
-template <
-	typename KernelPolicy,
-	typename DerivedCta,									// Derived CTA class
-	template <typename Policy> class Tile>			// Derived Tile class to use
-struct Cta
-{
-	//---------------------------------------------------------------------
-	// Typedefs and Constants
-	//---------------------------------------------------------------------
-
-	typedef typename KernelPolicy::KeyType 					KeyType;
-	typedef typename KernelPolicy::ValueType 				ValueType;
-	typedef typename KernelPolicy::SizeT 					SizeT;
-	typedef typename KernelPolicy::SmemStorage				SmemStorage;
-	typedef typename KernelPolicy::ByteGrid::LanePartial	LanePartial;
-
-	// Operational details type for short grid
-	typedef util::SrtsDetails<typename KernelPolicy::ByteGrid> 		ByteGridDetails;
-
-	typedef DerivedCta Dispatch;
-
-	//---------------------------------------------------------------------
-	// Members
-	//---------------------------------------------------------------------
-
-	// Shared storage for this CTA
-	typename KernelPolicy::SmemStorage 	&smem_storage;
-
-	// Input and output device pointers
-	KeyType								*d_in_keys;
-	KeyType								*d_out_keys;
-
-	ValueType							*d_in_values;
-	ValueType							*d_out_values;
-
-	// Operational details for scan grids
-	ByteGridDetails 					byte_grid_details;
-
-	SizeT								my_bin_carry;
-
-	KeyType 							*offset;
-	KeyType 							*next_offset;
-
-	//---------------------------------------------------------------------
-	// Methods
-	//---------------------------------------------------------------------
-
-	/**
-	 * Constructor
-	 */
-	__device__ __forceinline__ Cta(
-		SmemStorage 	&smem_storage,
-		KeyType 		*d_in_keys,
-		KeyType 		*d_out_keys,
-		ValueType 		*d_in_values,
-		ValueType 		*d_out_values,
-		SizeT 			*d_spine) :
-			smem_storage(smem_storage),
-			d_in_keys(d_in_keys),
-			d_out_keys(d_out_keys),
-			d_in_values(d_in_values),
-			d_out_values(d_out_values),
-			byte_grid_details(smem_storage.byte_raking_lanes),
-			offset(smem_storage.key_exchange + threadIdx.x + (threadIdx.x >> 5)),
-			next_offset(smem_storage.key_exchange + (threadIdx.x + 1) + ((threadIdx.x + 1) >> 5))
-	{
-
-		if (threadIdx.x < KernelPolicy::BINS) {
-
-			// Read bin_carry in parallel
-			int spine_bin_offset = (gridDim.x * threadIdx.x) + blockIdx.x;
-
-			my_bin_carry = tex1Dfetch(spine::SpineTex<SizeT>::ref, spine_bin_offset);
-
-			int2 item;
-			item.x = -1;
-			item.y = KernelPolicy::BINS;
-			smem_storage.bin_in_prefixes[threadIdx.x] = item;
-		}
-
-		if (threadIdx.x < CUB_WARP_THREADS(KernelPolicy::CUDA_ARCH)) {
-			smem_storage.warpscan[0][threadIdx.x] = 0;
-			smem_storage.warpscan[1][threadIdx.x] = 0;
-		}
-	}
-
-
-	/**
-	 * Process tile
-	 */
-	__device__ __forceinline__ void ProcessTile(
-		SizeT cta_offset,
-		const SizeT &guarded_elements = KernelPolicy::TILE_ELEMENTS)
-	{
-		Tile<KernelPolicy> tile;
-
-		tile.Partition(
-			cta_offset,
-			guarded_elements,
-			(Dispatch *) this);
-	}
-
-
-	/**
-	 * Process work range of tiles
-	 */
-	__device__ __forceinline__ void ProcessWorkRange(
-		util::CtaWorkLimits<SizeT> &work_limits)
-	{
-		// Make sure we get a local copy of the cta's offset (work_limits may be in smem)
-		SizeT pack_offset = smem_storage.packed_offset;
-
-		// Process full tiles of tile_elements
-		while (pack_offset < smem_storage.packed_offset_limit) {
-
-			ProcessTile(pack_offset);
-			pack_offset += (KernelPolicy::TILE_ELEMENTS / KernelPolicy::PACK_SIZE);
-		}
-
-/*
-		// Clean up last partial tile with guarded-io
-		if (work_limits.guarded_elements) {
-			ProcessTile(
-				pack_offset,
-				work_limits.guarded_elements);
-		}
-*/
-	}
-};
-
-
-} // namespace downsweep
-} // namespace partition
-} // namespace b40c
-

gpu/include/b40c/radix_sort/downsweep/6bit_prmt/kernel_policy.cuh

-/******************************************************************************
- * 
- * Copyright 2010-2011 Duane Merrill
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. 
- * 
- * For more information, see our Google Code project site: 
- * http://code.google.com/p/back40computing/
- * 
- ******************************************************************************/
-
-/******************************************************************************
- * Configuration policy for partitioning downsweep scan kernels
- ******************************************************************************/
-
-#pragma once
-
-#include <b40c/util/cuda_properties.cuh>
-#include <b40c/util/basic_utils.cuh>
-#include <b40c/util/srts_grid.cuh>
-
-namespace b40c {
-namespace partition {
-namespace downsweep {
-
-
-/**
- * A detailed partitioning downsweep kernel configuration policy type that specializes
- * kernel code for a specific pass.  It encapsulates tuning configuration policy
- * details derived from TuningPolicy
- */
-template <typename TuningPolicy>
-struct KernelPolicy : TuningPolicy
-{
-	typedef typename TuningPolicy::SizeT 		SizeT;
-	typedef typename TuningPolicy::KeyType 		KeyType;
-	typedef typename TuningPolicy::ValueType 	ValueType;
-
-	enum {
-
-		BINS 							= 1 << TuningPolicy::LOG_BINS,
-		THREADS							= 1 << TuningPolicy::LOG_THREADS,
-
-		LOG_WARPS						= TuningPolicy::LOG_THREADS - CUB_LOG_WARP_THREADS(TuningPolicy::CUDA_ARCH),
-		WARPS							= 1 << LOG_WARPS,
-
-		LOAD_VEC_SIZE					= 1 << TuningPolicy::LOG_LOAD_VEC_SIZE,
-		LOADS_PER_TILE					= 1 << TuningPolicy::LOG_LOADS_PER_TILE,
-
-		LOG_TILE_ELEMENTS_PER_THREAD	= TuningPolicy::LOG_LOAD_VEC_SIZE + TuningPolicy::LOG_LOADS_PER_TILE,
-		TILE_ELEMENTS_PER_THREAD		= 1 << LOG_TILE_ELEMENTS_PER_THREAD,
-
-		LOG_TILE_ELEMENTS				= TuningPolicy::LOG_THREADS + LOG_TILE_ELEMENTS_PER_THREAD,
-		TILE_ELEMENTS					= 1 << LOG_TILE_ELEMENTS,
-	
-		LOG_SCAN_BINS					= (TuningPolicy::LOG_BINS > 3) ? 3 : TuningPolicy::LOG_BINS,
-		SCAN_BINS						= 1 << LOG_SCAN_BINS,
-
-		LOG_SCAN_LANES_PER_TILE			= CUB_MAX((LOG_SCAN_BINS - 2), 0),		// Always at least one lane per load
-		SCAN_LANES_PER_TILE				= 1 << LOG_SCAN_LANES_PER_TILE,
-
-		LOG_DEPOSITS_PER_LANE 			= TuningPolicy::LOG_THREADS + TuningPolicy::LOG_LOADS_PER_TILE,
-	};
-
-
-	// Smem SRTS grid type for reducing and scanning a tile of
-	// (bins/4) lanes of composite 8-bit bin counters
-	typedef util::SrtsGrid<
-		TuningPolicy::CUDA_ARCH,
-		int,											// Partial type
-		LOG_DEPOSITS_PER_LANE,							// Deposits per lane
-		LOG_SCAN_LANES_PER_TILE,						// Lanes (the number of composite digits)
-		TuningPolicy::LOG_RAKING_THREADS,				// Raking threads
-		false>											// Any prefix dependences between lanes are explicitly managed
-			ByteGrid;
-
-	
-	/**
-	 * Shared storage for partitioning upsweep
-	 */
-	struct SmemStorage
-	{
-		SizeT							packed_offset;
-		SizeT							packed_offset_limit;
-
-		bool 							non_trivial_pass;
-		util::CtaWorkLimits<SizeT> 		work_limits;
-
-		SizeT							bin_carry[BINS];
-		int2							bin_in_prefixes[BINS + 1];
-
-		// Storage for scanning local ranks
-		volatile int 					warpscan[2][CUB_WARP_THREADS(CUDA_ARCH) * 3 / 2];
-
-		union {
-			struct {
-				int 					byte_raking_lanes[ByteGrid::RAKING_ELEMENTS];
-				int						short_prefixes[2][ByteGrid::RAKING_THREADS];
-			};
-
-			int							bin_ex_prefixes[BINS + 1];
-
-			KeyType 					key_exchange[TILE_ELEMENTS + (TILE_ELEMENTS / 32)];			// Last index is for invalid elements to be culled (if any)
-		};
-	};
-
-	enum {
-		THREAD_OCCUPANCY					= CUB_SM_THREADS(CUDA_ARCH) >> TuningPolicy::LOG_THREADS,
-		SMEM_OCCUPANCY						= CUB_SMEM_BYTES(CUDA_ARCH) / sizeof(SmemStorage),
-		MAX_CTA_OCCUPANCY					= CUB_MIN(CUB_SM_CTAS(CUDA_ARCH), CUB_MIN(THREAD_OCCUPANCY, SMEM_OCCUPANCY)),
-
-		VALID								= (MAX_CTA_OCCUPANCY > 0),
-	};
-
-
-	__device__ __forceinline__ static void PreprocessKey(KeyType &key) {}
-
-	__device__ __forceinline__ static void PostprocessKey(KeyType &key) {}
-};
-	
-
-
-} // namespace downsweep
-} // namespace partition
-} // namespace b40c
-
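
The "composite 8-bit bin counters" mentioned above pack several small per-bin counts into one machine word, so a single integer add updates one counter without disturbing its neighbors. A toy sketch of the idea, assuming nothing about b40c's actual layout (here 8 bins of 4 bits each, so each bin can count at most 15 keys before overflow, which is why the tile code below periodically expands nibbles into bytes):

    #include <cstdint>
    #include <cstdio>

    // Toy nibble-packed histogram: 8 bins, 4 bits per bin, one 32-bit word.
    // Illustration of the packing idea only, not b40c's actual layout.
    int main() {
      std::uint32_t counts = 0;
      const int keys[] = {3, 0, 3, 7, 1, 3};
      for (int key : keys)
        counts += 1u << (4 * key);            // bump bin `key`
      for (int bin = 0; bin < 8; ++bin)
        std::printf("bin %d: %u\n", bin, (counts >> (4 * bin)) & 0xFu);
      return 0;
    }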

gpu/include/b40c/radix_sort/downsweep/6bit_prmt/tile.cuh

-/******************************************************************************
- * 
- * Copyright 2010-2011 Duane Merrill
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. 
- * 
- * For more information, see our Google Code project site: 
- * http://code.google.com/p/back40computing/
- * 
- ******************************************************************************/
-
-/******************************************************************************
- * Abstract tile-processing functionality for partitioning downsweep scan
- * kernels
- ******************************************************************************/
-
-#pragma once
-
-#include <b40c/util/cuda_properties.cuh>
-#include <b40c/util/basic_utils.cuh>
-#include <b40c/util/io/modified_load.cuh>
-#include <b40c/util/io/modified_store.cuh>
-#include <b40c/util/io/load_tile.cuh>
-#include <b40c/util/io/scatter_tile.cuh>
-#include <b40c/util/reduction/serial_reduce.cuh>
-#include <b40c/util/scan/serial_scan.cuh>
-#include <b40c/util/scan/warp_scan.cuh>
-#include <b40c/util/device_intrinsics.cuh>
-#include <b40c/util/soa_tuple.cuh>
-#include <b40c/util/scan/soa/cooperative_soa_scan.cuh>
-
-namespace b40c {
-namespace partition {
-namespace downsweep {
-
-
-/**
- * Templated texture reference for keys
- */
-template <typename KeyType>
-struct KeysTex
-{
-	static texture<KeyType, cudaTextureType1D, cudaReadModeElementType> ref;
-};
-template <typename KeyType>
-texture<KeyType, cudaTextureType1D, cudaReadModeElementType> KeysTex<KeyType>::ref;
-
-
-
-/**
- * Tile
- *
- * Abstract class
- */
-template <
-	typename KernelPolicy,
-	typename DerivedTile>
-struct Tile
-{
-	//---------------------------------------------------------------------
-	// Typedefs and Constants
-	//---------------------------------------------------------------------
-
-	typedef typename KernelPolicy::KeyType 					KeyType;
-	typedef typename KernelPolicy::ValueType 				ValueType;
-	typedef typename KernelPolicy::SizeT 					SizeT;
-
-	typedef DerivedTile Dispatch;
-
-	enum {
-		LOAD_VEC_SIZE 				= KernelPolicy::LOAD_VEC_SIZE,
-		LOADS_PER_TILE 				= KernelPolicy::LOADS_PER_TILE,
-		TILE_ELEMENTS_PER_THREAD 	= KernelPolicy::TILE_ELEMENTS_PER_THREAD,
-
-		LOG_SCAN_LANES				= KernelPolicy::LOG_SCAN_LANES_PER_TILE,
-		SCAN_LANES					= KernelPolicy::SCAN_LANES_PER_TILE,
-
-		LOG_PACKS_PER_LOAD			= KernelPolicy::LOG_LOAD_VEC_SIZE - KernelPolicy::LOG_PACK_SIZE,
-		PACKS_PER_LOAD				= 1 << LOG_PACKS_PER_LOAD,
-
-		LANE_ROWS_PER_LOAD 			= KernelPolicy::ByteGrid::ROWS_PER_LANE / KernelPolicy::LOADS_PER_TILE,
-		LANE_STRIDE_PER_LOAD 		= KernelPolicy::ByteGrid::PADDED_PARTIALS_PER_ROW * LANE_ROWS_PER_LOAD,
-
-		INVALID_BIN					= -1,
-
-		LOG_RAKING_THREADS 			= KernelPolicy::ByteGrid::LOG_RAKING_THREADS,
-		RAKING_THREADS 				= 1 << LOG_RAKING_THREADS,
-
-		LOG_WARPSCAN_THREADS		= CUB_LOG_WARP_THREADS(CUDA_ARCH),
-		WARPSCAN_THREADS 			= 1 << LOG_WARPSCAN_THREADS,
-
-	};
-
-	//---------------------------------------------------------------------
-	// Members
-	//---------------------------------------------------------------------
-
-
-	// The keys (and values) this thread will read this tile
-	KeyType 	keys[LOADS_PER_TILE][LOAD_VEC_SIZE];
-	ValueType 	values[TILE_ELEMENTS_PER_THREAD];
-
-	// For each load:
-	// 		counts_nibbles contains the bin counts within nibbles ordered right to left
-	// 		bins_nibbles contains the bin for each key within nibbles ordered right to left
-	// 		load_prefix_bytes contains the exclusive scan for each key within nibbles ordered right to left
-
-	int 		bins_nibbles[(LOAD_VEC_SIZE + 7) / 8][LOADS_PER_TILE];
-
-	int 		counts_nibbles[SCAN_LANES / 2][LOADS_PER_TILE];
-	int			counts_bytes[SCAN_LANES][LOADS_PER_TILE];
-
-	int 		load_prefix_bytes[(LOAD_VEC_SIZE + 3) / 4][LOADS_PER_TILE];
-
-	int 		warpscan_shorts[LOADS_PER_TILE][4];
-
-	int 		local_ranks[LOADS_PER_TILE][LOAD_VEC_SIZE];		// The local rank of each key
-	SizeT 		scatter_offsets[LOADS_PER_TILE][LOAD_VEC_SIZE];	// The global rank of each key
-
-	int 		bins[TILE_ELEMENTS_PER_THREAD];
-
-
-	//---------------------------------------------------------------------
-	// Tile Methods
-	//---------------------------------------------------------------------
-
-	/**
-	 * ExtractRanks
-	 */
-	template <int LOAD, int VEC, int REM = (VEC & 7)>
-	struct ExtractRanks
-	{
-		template <typename Cta, typename Tile>
-		static __device__ __forceinline__ void Invoke(Cta *cta, Tile *tile, const bool shift_bytes) {}
-	};
-
-
-	/**
-	 * ExtractRanks (VEC % 8 == 0)
-	 */
-	template <int LOAD, int VEC>
-	struct ExtractRanks<LOAD, VEC, 0>
-	{
-		template <typename Cta, typename Tile>
-		static __device__ __forceinline__ void Invoke(Cta *cta, Tile *tile, const bool shift_bytes)
-		{
-/*
-			printf("\tTid(%d) Vec(%d) bins_nibbles(%08x)\n",
-				threadIdx.x, VEC, tile->bins_nibbles[VEC / 8][LOAD]);
-*/
-			// Decode prefix bytes for first four keys
-			tile->load_prefix_bytes[VEC / 4][LOAD] += util::PRMT(
-				tile->counts_bytes[0][LOAD],
-				tile->counts_bytes[1][LOAD],
-				tile->bins_nibbles[VEC / 8][LOAD]);
-
-			// Decode scan low and high packed words for first four keys
-			int warpscan_prefix[2];
-			warpscan_prefix[0] = util::PRMT(
-				tile->warpscan_shorts[LOAD][0],
-				tile->warpscan_shorts[LOAD][1],
-				tile->bins_nibbles[VEC / 8][LOAD]);
-
-			warpscan_prefix[1] = util::PRMT(
-				tile->warpscan_shorts[LOAD][2],
-				tile->warpscan_shorts[LOAD][3],
-				tile->bins_nibbles[VEC / 8][LOAD]);
-
-			// Low
-			int packed_scatter =
-				util::PRMT(								// Warpscan component (de-interleaved)
-					warpscan_prefix[0],
-					warpscan_prefix[1],
-					0x5140) +
-				util::PRMT(								// Raking scan component (lower bytes from each half)
-					tile->load_prefix_bytes[VEC / 4][LOAD],
-					0,
-					0x4140);
-
-			packed_scatter = util::SHR_ADD(0xffe0ffe0 & packed_scatter, 5, packed_scatter);
-			packed_scatter <<= 2;
-
-			tile->local_ranks[LOAD][VEC + 0] = packed_scatter & 0x0000ffff;
-			tile->local_ranks[LOAD][VEC + 1] = packed_scatter >> 16;
-
-			// High
-			packed_scatter =
-				util::PRMT(								// Warpscan component (de-interleaved)
-					warpscan_prefix[0],
-					warpscan_prefix[1],
-					0x7362) +
-				util::PRMT(								// Raking scan component (upper bytes from each half)
-					tile->load_prefix_bytes[VEC / 4][LOAD],
-					0,
-					0x4342);
-
-			packed_scatter = util::SHR_ADD(0xffe0ffe0 & packed_scatter, 5, packed_scatter);
-			packed_scatter <<= 2;
-
-			tile->local_ranks[LOAD][VEC + 2] = packed_scatter & 0x0000ffff;
-			tile->local_ranks[LOAD][VEC + 3] = packed_scatter >> 16;
-
-		}
-	};
-
-
-	/**
-	 * ExtractRanks (VEC % 8 == 4)
-	 */
-	template <int LOAD, int VEC>
-	struct ExtractRanks<LOAD, VEC, 4>
-	{
-		template <typename Cta, typename Tile>
-		static __device__ __forceinline__ void Invoke(Cta *cta, Tile *tile, const bool shift_bytes)
-		{
-			int upper_bins_nibbles = tile->bins_nibbles[VEC / 8][LOAD] >> 16;
-
-			// Decode prefix bytes for second four keys
-			tile->load_prefix_bytes[VEC / 4][LOAD] += util::PRMT(
-				tile->counts_bytes[0][LOAD],
-				tile->counts_bytes[1][LOAD],
-				upper_bins_nibbles);
-
-			// Decode scan low and high packed words for second four keys
-			int warpscan_prefix[2];
-			warpscan_prefix[0] = util::PRMT(
-				tile->warpscan_shorts[LOAD][0],
-				tile->warpscan_shorts[LOAD][1],
-				upper_bins_nibbles);
-
-			warpscan_prefix[1] = util::PRMT(
-				tile->warpscan_shorts[LOAD][2],
-				tile->warpscan_shorts[LOAD][3],
-				upper_bins_nibbles);
-
-			// Low
-			int packed_scatter =
-				util::PRMT(								// Warpscan component (de-interleaved)
-					warpscan_prefix[0],
-					warpscan_prefix[1],
-					0x5140) +
-				util::PRMT(								// Raking scan component (lower bytes from each half)
-					tile->load_prefix_bytes[VEC / 4][LOAD],
-					0,
-					0x4140);
-
-			packed_scatter = util::SHR_ADD(0xffe0ffe0 & packed_scatter, 5, packed_scatter);
-			packed_scatter <<= 2;
-
-			tile->local_ranks[LOAD][VEC + 0] = packed_scatter & 0x0000ffff;
-			tile->local_ranks[LOAD][VEC + 1] = packed_scatter >> 16;
-
-			// High
-			packed_scatter =
-				util::PRMT(								// Warpscan component (de-interleaved)
-					warpscan_prefix[0],
-					warpscan_prefix[1],
-					0x7362) +
-				util::PRMT(								// Raking scan component (upper bytes from each half)
-					tile->load_prefix_bytes[VEC / 4][LOAD],
-					0,
-					0x4342);
-
-			packed_scatter = util::SHR_ADD(0xffe0ffe0 & packed_scatter, 5, packed_scatter);
-			packed_scatter <<= 2;
-
-			tile->local_ranks[LOAD][VEC + 2] = packed_scatter & 0x0000ffff;
-			tile->local_ranks[LOAD][VEC + 3] = packed_scatter >> 16;
-		}
-	};
-
-
-
-	//---------------------------------------------------------------------
-	// IterateTileElements Structures
-	//---------------------------------------------------------------------
-
-	/**
-	 * Iterate next vector element
-	 */
-	template <int LOAD, int VEC, int dummy = 0>
-	struct IterateTileElements
-	{
-		// DecodeKeys
-		template <typename Cta, typename Tile>
-		static __device__ __forceinline__ void DecodeKeys(
-			Cta *cta,
-			Tile *tile,
-			const int CURRENT_BIT)
-		{
-			// Decode the bin for this key
-			int bin = util::BFE(
-				tile->keys[LOAD][VEC],
-				CURRENT_BIT,
-				KernelPolicy::LOG_SCAN_BINS);
-
-			const int BITS_PER_NIBBLE = 4;
-			int shift = bin * BITS_PER_NIBBLE;
-
-			// Initialize exclusive scan bytes
-			if (VEC == 0) {
-
-				tile->load_prefix_bytes[VEC / 4][LOAD] = 0;
-
-			} else {
-				int prev_counts_nibbles = tile->counts_nibbles[0][LOAD] >> shift;
-
-				if ((VEC & 3) == 0) {
-
-					tile->load_prefix_bytes[VEC / 4][LOAD] = prev_counts_nibbles & 0xf;
-
-				} else if ((VEC & 7) < 4) {
-
-					util::BFI(
-						tile->load_prefix_bytes[VEC / 4][LOAD],
-						tile->load_prefix_bytes[VEC / 4][LOAD],
-						prev_counts_nibbles,
-						8 * (VEC & 7),
-						BITS_PER_NIBBLE);
-
-				} else {
-
-					util::BFI(
-						tile->load_prefix_bytes[VEC / 4][LOAD],
-						tile->load_prefix_bytes[VEC / 4][LOAD],
-						prev_counts_nibbles,
-						8 * ((VEC & 7) - 4),
-						BITS_PER_NIBBLE);
-				}
-			}
-
-			// Initialize counts nibbles
-			if (VEC == 0) {
-				tile->counts_nibbles[0][LOAD] = 1 << shift;
-
-			} else if (VEC == LOAD_VEC_SIZE - 1) {
-
-				// last vector element
-				if ((VEC & 15) == 15) {
-
-					// Protect overflow: expand nibbles into bytes and then add
-					util::NibblesToBytes(
-						tile->counts_bytes[0][LOAD],
-						tile->counts_bytes[1][LOAD],
-						tile->counts_nibbles[0][LOAD]);
-
-					shift = shift * 2;
-					util::SHL_ADD(
-						tile->counts_bytes[0][LOAD],
-						1,
-						shift,
-						tile->counts_bytes[0][LOAD]);
-
-					util::SHL_ADD(
-						tile->counts_bytes[1][LOAD],
-						1,
-						shift - 32,
-						tile->counts_bytes[1][LOAD]);
-
-				} else {
-
-					// Add nibble then expand into bytes
-					util::SHL_ADD(
-						tile->counts_nibbles[0][LOAD],
-						1,
-						shift,
-						tile->counts_nibbles[0][LOAD]);
-
-					util::NibblesToBytes(
-						tile->counts_bytes[0][LOAD],
-						tile->counts_bytes[1][LOAD],
-						tile->counts_nibbles[0][LOAD]);
-				}
-
-			} else {
-				util::SHL_ADD(
-					tile->counts_nibbles[0][LOAD],
-					1,
-					shift,
-					tile->counts_nibbles[0][LOAD]);
-			}
-
-			// Initialize bins nibbles
-			if ((VEC & 7) == 0) {
-				tile->bins_nibbles[VEC / 8][LOAD] = bin;
-
-			} else {
-				util::BFI(
-					tile->bins_nibbles[VEC / 8][LOAD],
-					tile->bins_nibbles[VEC / 8][LOAD],
-					bin,
-					4 * (VEC & 7),
-					4);
-			}
-
-			// Next vector element
-			IterateTileElements<LOAD, VEC + 1>::DecodeKeys(cta, tile, CURRENT_BIT);
-		}
-
-		// ComputeRanks
-		template <typename Cta, typename Tile>
-		static __device__ __forceinline__ void ComputeRanks(Cta *cta, Tile *tile, const bool shift_bytes)
-		{
-			if (VEC == 0) {
-
-				const int LANE_OFFSET = LOAD * LANE_STRIDE_PER_LOAD;
-
-				// Extract prefix bytes from bytes raking grid
-				tile->counts_bytes[0][LOAD] = cta->byte_grid_details.lane_partial[0][LANE_OFFSET];
-				tile->counts_bytes[1][LOAD] = cta->byte_grid_details.lane_partial[1][LANE_OFFSET];
-
-				// Extract warpscan shorts
-				const int LOAD_RAKING_TID_OFFSET = (KernelPolicy::THREADS * LOAD) >> KernelPolicy::ByteGrid::LOG_PARTIALS_PER_SEG;
-
-				int base_raking_tid = threadIdx.x >> KernelPolicy::ByteGrid::LOG_PARTIALS_PER_SEG;
-
-				tile->warpscan_shorts[LOAD][0] = cta->smem_storage.short_prefixes[0][base_raking_tid + LOAD_RAKING_TID_OFFSET];
-				tile->warpscan_shorts[LOAD][1] = cta->smem_storage.short_prefixes[1][base_raking_tid + LOAD_RAKING_TID_OFFSET];
-				tile->warpscan_shorts[LOAD][2] = cta->smem_storage.short_prefixes[0][base_raking_tid + LOAD_RAKING_TID_OFFSET + (RAKING_THREADS / 2)];
-				tile->warpscan_shorts[LOAD][3] = cta->smem_storage.short_prefixes[1][base_raking_tid + LOAD_RAKING_TID_OFFSET + (RAKING_THREADS / 2)];
-			}
-
-			ExtractRanks<LOAD, VEC>::Invoke(cta, tile, shift_bytes);
-/*
-			printf("tid(%d) vec(%d) key(%d) scatter(%d)\n",
-				threadIdx.x,
-				VEC,
-				tile->keys[LOAD][VEC],
-				tile->local_ranks[LOAD][VEC] / 4);
-*/
-			// Next vector element
-			IterateTileElements<LOAD, VEC + 1>::ComputeRanks(cta, tile, shift_bytes);
-		}
-	};
-
-
-
-	/**
-	 * IterateTileElements next load
-	 */
-	template <int LOAD, int dummy>
-	struct IterateTileElements<LOAD, LOAD_VEC_SIZE, dummy>
-	{
-		// DecodeKeys
-		template <typename Cta, typename Tile>
-		static __device__ __forceinline__ void DecodeKeys(
-			Cta *cta,
-			Tile *tile,
-			const int CURRENT_BIT)
-		{
-			const int LANE_OFFSET = LOAD * LANE_STRIDE_PER_LOAD;
-
-			// Place keys into raking grid
-			cta->byte_grid_details.lane_partial[0][LANE_OFFSET] = tile->counts_bytes[0][LOAD];
-			cta->byte_grid_details.lane_partial[1][LANE_OFFSET] = tile->counts_bytes[1][LOAD];
-/*
-			printf("Tid %u load %u:\t,"
-				"load_prefix_bytes[0](%08x), "
-				"load_prefix_bytes[1](%08x), "
-				"counts_bytes[0](%08x), "
-				"counts_bytes[1](%08x), "
-				"\n",
-				threadIdx.x, LOAD,
-				tile->load_prefix_bytes[0][LOAD],
-				tile->load_prefix_bytes[1][LOAD],
-				tile->counts_bytes[0][LOAD],
-				tile->counts_bytes[1][LOAD]);
-*/
-			// First vector element, next load
-			IterateTileElements<LOAD + 1, 0>::DecodeKeys(cta, tile, CURRENT_BIT);
-		}
-
-		template <typename Cta, typename Tile>
-		static __device__ __forceinline__ void ComputeRanks(Cta *cta, Tile *tile, const bool shift_bytes)
-		{
-			// First vector element, next load
-			IterateTileElements<LOAD + 1, 0>::ComputeRanks(cta, tile, shift_bytes);
-		}
-
-	};
-
-	/**
-	 * Terminate iteration
-	 */
-	template <int dummy>
-	struct IterateTileElements<LOADS_PER_TILE, 0, dummy>
-	{
-		// DecodeKeys
-		template <typename Cta, typename Tile>
-		static __device__ __forceinline__ void DecodeKeys(Cta *cta, Tile *tile, const int CURRENT_BIT) {}
-
-		// ExtractRanks
-		template <typename Cta, typename Tile>
-		static __device__ __forceinline__ void ComputeRanks(Cta *cta, Tile *tile, const bool shift_bytes) {}
-	};
-
-
-
-	//---------------------------------------------------------------------
-	// Tile Internal Methods
-	//---------------------------------------------------------------------
-
-
-	/**
-	 * Scan Tile
-	 */
-	template <typename Cta>
-	__device__ __forceinline__ void ScanTile(Cta *cta, const int CURRENT_BIT, const bool shift_bytes)
-	{
-
-		// Decode bins and place keys into grid
-		IterateTileElements<0, 0>::DecodeKeys(cta, this, CURRENT_BIT);
-
-		__syncthreads();
-
-		int tid = threadIdx.x & 31;
-		int warp = threadIdx.x >> 5;
-		volatile int *p = cta->smem_storage.short_prefixes[warp];
-		volatile int *p2 = &cta->smem_storage.short_prefixes[warp][tid * 2];
-		volatile int *warpscan = cta->smem_storage.warpscan[warp];
-
-		// Use our raking threads to, in aggregate, scan the composite counter lanes
-		if (threadIdx.x < RAKING_THREADS) {
-
-/*
-			if (threadIdx.x == 0) {
-				printf("ByteGrid:\n");
-				KernelPolicy::ByteGrid::Print();
-				printf("\n");
-			}
-*/
-			// Upsweep rake
-			int partial_bytes = util::scan::SerialScan<KernelPolicy::ByteGrid::PARTIALS_PER_SEG>::Invoke(
-				cta->byte_grid_details.raking_segment,
-				0);