Commits

Rio Yokota committed d74208c

Prepare to introduce 2D kernels.

  • Participants
  • Parent commits c8cee38

Comments (0)

Files changed (11)

 CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 
 # All options are set here
+SET(DIMENSION 3D) #{2D, 3D}
 SET(EQUATION Laplace) # {Laplace, Yukawa, Helmholtz, Stokes}
 SET(BASIS Cartesian) # {Cartesian, Spherical, Rotation, Planewave}
 OPTION(USE_MPI "Use MPI" ON)
 
 SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wshadow -Wuninitialized -O3")
 INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/include)
+ADD_DEFINITIONS(-D${DIMENSION})
 ADD_DEFINITIONS(-D${EQUATION})
 ADD_DEFINITIONS(-D${BASIS})
 ADD_DEFINITIONS(-D${DEVICE})
 .SUFFIXES: .cxx .cu .o
 
+### choose dimension
+#DIMENSION = 2D # (only works with Laplace Cartesian)
+DIMENSION = 3D
+
 ### choose kernel
 EQUATION = Laplace
 #EQUATION = Yukawa (not available yet)
 
 ### choose device to use
 DEVICE	= CPU
-#DEVICE	= GPU (not integrated)
+#DEVICE	= GPU (not available yet)
 
 ### choose C++ compiler
 CXX	= mpicxx -ggdb3 -Wall -Wextra -O3 -msse4a -ffast-math -funroll-loops # GCC
 CXX	+= -I../include
 LFLAGS	+= -D$(BASIS)
 LFLAGS  += -DEXPANSION=4 # Specifcy expansion order
+LFLAGS	+= -DUSE_SIMD # Use SIMD intrinsic kernels
 #LFLAGS	+= -DERROR_OPT # Use error optimized theta
 #LFLAGS	+= -DUSE_RMAX # Use Rmax in multipole acceptance criteria
 LFLAGS	+= -DDUAL # Use dual tree traversal (turn off every option above before turning this off)
 #LFLAGS	+= -DTRACE
 
 ### VTK flags : VTK is available from http://www.vtk.org/VTK/resources/software.html
-#VTK_INCLUDE_PATH = /usr/include/vtk-5.8
-#VTK_LIBRARY_PATH = /usr/lib/vtk-5.8
-#CXX	+= -I$(VTK_INCLUDE_PATH)
-#LFLAGS	+= -L$(VTK_LIBRARY_PATH) -DVTK -lvtkCharts -lvtkRendering -lvtkGraphics -lvtkFiltering -lvtkViews -lvtkCommon -lvtkWidgets -lvtkIO
+#LFLAGS	+= -DVTK -lvtkCharts -lvtkRendering -lvtkGraphics -lvtkFiltering -lvtkViews -lvtkCommon -lvtkWidgets -lvtkIO
 
 ifeq ($(DEVICE),GPU)
-### CUDA path
-CUDA_INSTALL_PATH = /usr/local/cuda
-CUDA_SDK_PATH = /usr/local/cuda_sdk/C
 ### CUDA compiler
-NVCC	= nvcc -Xcompiler -fopenmp --ptxas-options=-v -O3 -use_fast_math -arch=sm_21 -I../include -I$(CUDA_INSTALL_PATH)/include
+NVCC	= nvcc -Xcompiler -fopenmp --ptxas-options=-v -O3 -use_fast_math -arch=sm_21 -I../include
 ### CUDA flags
-LFLAGS  += -L$(CUDA_INSTALL_PATH)/lib64 -lcuda -lcudart -lstdc++ -ldl -lm
-SOURCES	= ../kernels/$(EQUATION)$(BASIS)$(DEVICE).cu ../kernels/$(EQUATION)P2P$(DEVICE).cu
+LFLAGS  += -lcuda -lcudart -lstdc++ -ldl -lm
+SOURCES	= ../kernels/$(DIMENSION)$(EQUATION)$(BASIS)$(DEVICE).cu ../kernels/$(DIMENSION)$(EQUATION)P2P$(DEVICE).cu
 OBJECTS	= $(SOURCES:.cu=.o)
 else
-SOURCES	= ../kernels/$(EQUATION)$(BASIS)$(DEVICE).cxx ../kernels/$(EQUATION)P2P$(DEVICE).cxx
+SOURCES	= ../kernels/$(DIMENSION)$(EQUATION)$(BASIS)$(DEVICE).cxx ../kernels/$(DIMENSION)$(EQUATION)P2P$(DEVICE).cxx
 OBJECTS	= $(SOURCES:.cxx=.o)
 endif
 
 This file is included from each Makefile in all subdirectories.
 Most options of exafmm can be controlled from this file.
 
+DIMENSION : 2D or 3D
 EQUATION : Type of equation to solve
 BASIS : Type of multipole/local expansion basis
 DEVICE : CPU or GPU

kernels/2DLaplaceP2PCPU.cxx

+#include "kernel.h"
+#include "simd.h"
+const real_t EPS2 = 0.0;                                        //!< Softening parameter (squared)
+
+void Kernel::P2P(C_iter Ci, C_iter Cj, bool mutual) const {
+  B_iter Bi = Ci->BODY;
+  B_iter Bj = Cj->BODY;
+  int ni = Ci->NDBODY;
+  int nj = Cj->NDBODY;
+  int i = 0;
+#if USE_SIMD
+  for ( ; i<=ni-NSIMD; i+=NSIMD) {
+    simdvec zero = 0;
+    ksimdvec pot = zero;
+    ksimdvec ax = zero;
+    ksimdvec ay = zero;
+    ksimdvec az = zero;
+
+    simdvec xi = SIMD<simdvec,0,NSIMD>::setBody(Bi,i);
+    simdvec yi = SIMD<simdvec,1,NSIMD>::setBody(Bi,i);
+    simdvec zi = SIMD<simdvec,2,NSIMD>::setBody(Bi,i);
+    simdvec mi = SIMD<simdvec,3,NSIMD>::setBody(Bi,i);
+    simdvec R2 = EPS2;
+
+    simdvec xj = Xperiodic[0];
+    xi -= xj;
+    simdvec yj = Xperiodic[1];
+    yi -= yj;
+    simdvec zj = Xperiodic[2];
+    zi -= zj;
+
+    simdvec x2 = Bj[0].X[0];
+    x2 -= xi;
+    simdvec y2 = Bj[0].X[1];
+    y2 -= yi;
+    simdvec z2 = Bj[0].X[2];
+    z2 -= zi;
+    simdvec mj = Bj[0].SRC;
+
+    xj = x2;
+    R2 += x2 * x2;
+    yj = y2;
+    R2 += y2 * y2;
+    zj = z2;
+    R2 += z2 * z2;
+    simdvec invR;
+
+    x2 = Bj[1].X[0];
+    y2 = Bj[1].X[1];
+    z2 = Bj[1].X[2];
+    for (int j=0; j<nj-2; j++) {
+      invR = rsqrt(R2);
+      invR &= R2 > zero;
+      R2 = EPS2;
+      x2 -= xi;
+      y2 -= yi;
+      z2 -= zi;
+
+      mj *= invR * mi;
+      pot += mj;
+      if (mutual) Bj[j].TRG[0] += sum(mj);
+      invR = invR * invR * mj;
+      mj = Bj[j+1].SRC;
+
+      xj *= invR;
+      ax += xj;
+      if (mutual) Bj[j].TRG[1] -= sum(xj);
+      xj = x2;
+      R2 += x2 * x2;
+      x2 = Bj[j+2].X[0];
+
+      yj *= invR;
+      ay += yj;
+      if (mutual) Bj[j].TRG[2] -= sum(yj);
+      yj = y2;
+      R2 += y2 * y2;
+      y2 = Bj[j+2].X[1];
+
+      zj *= invR;
+      az += zj;
+      if (mutual) Bj[j].TRG[3] -= sum(zj);
+      zj = z2;
+      R2 += z2 * z2;
+      z2 = Bj[j+2].X[2];
+    }
+    if ( nj > 1 ) {
+      invR = rsqrt(R2);
+      invR &= R2 > zero;
+      R2 = EPS2;
+      x2 -= xi;
+      y2 -= yi;
+      z2 -= zi;
+
+      mj *= invR * mi;
+      pot += mj;
+      if (mutual) Bj[nj-2].TRG[0] += sum(mj);
+      invR = invR * invR * mj;
+      mj = Bj[nj-1].SRC;
+
+      xj *= invR;
+      ax += xj;
+      if (mutual) Bj[nj-2].TRG[1] -= sum(xj);
+      xj = x2;
+      R2 += x2 * x2;
+
+      yj *= invR;
+      ay += yj;
+      if (mutual) Bj[nj-2].TRG[2] -= sum(yj);
+      yj = y2;
+      R2 += y2 * y2;
+
+      zj *= invR;
+      az += zj;
+      if (mutual) Bj[nj-2].TRG[3] -= sum(zj);
+      zj = z2;
+      R2 += z2 * z2;
+    }
+    invR = rsqrt(R2);
+    invR &= R2 > zero;
+    mj *= invR * mi;
+    pot += mj;
+    if (mutual) Bj[nj-1].TRG[0] += sum(mj);
+    invR = invR * invR * mj;
+
+    xj *= invR;
+    ax += xj;
+    if (mutual) Bj[nj-1].TRG[1] -= sum(xj);
+    yj *= invR;
+    ay += yj;
+    if (mutual) Bj[nj-1].TRG[2] -= sum(yj);
+    zj *= invR;
+    az += zj;
+    if (mutual) Bj[nj-1].TRG[3] -= sum(zj);
+    for (int k=0; k<NSIMD; k++) {
+      Bi[i+k].TRG[0] += transpose(pot,k);
+      Bi[i+k].TRG[1] += transpose(ax,k);
+      Bi[i+k].TRG[2] += transpose(ay,k);
+      Bi[i+k].TRG[3] += transpose(az,k);
+    }
+  }
+#endif
+  for ( ; i<ni; i++) {
+    kreal_t pot = 0; 
+    kreal_t ax = 0;
+    kreal_t ay = 0;
+    kreal_t az = 0;
+    for (int j=0; j<nj; j++) {
+      vec3 dX = Bi[i].X - Bj[j].X - Xperiodic;
+      real_t R2 = norm(dX) + EPS2;
+      if (R2 != 0) {
+        real_t invR2 = 1.0 / R2;
+        real_t invR = Bi[i].SRC * Bj[j].SRC * sqrt(invR2);
+        dX *= invR2 * invR;
+        pot += invR;
+        ax += dX[0];
+        ay += dX[1];
+        az += dX[2];
+        if (mutual) {
+          Bj[j].TRG[0] += invR;
+          Bj[j].TRG[1] += dX[0];
+          Bj[j].TRG[2] += dX[1];
+          Bj[j].TRG[3] += dX[2];
+        }
+      }
+    }
+    Bi[i].TRG[0] += pot;
+    Bi[i].TRG[1] -= ax;
+    Bi[i].TRG[2] -= ay;
+    Bi[i].TRG[3] -= az;
+  }
+}
+
+void Kernel::P2P(C_iter C) const {
+  B_iter B = C->BODY;
+  int n = C->NDBODY;
+  int i = 0;
+#if USE_SIMD
+  for ( ; i<=n-NSIMD; i+=NSIMD) {
+    simdvec zero = 0;
+    ksimdvec pot = zero;
+    ksimdvec ax = zero;
+    ksimdvec ay = zero;
+    ksimdvec az = zero;
+
+    simdvec index = SIMD<simdvec,0,NSIMD>::setIndex(i);
+    simdvec xi = SIMD<simdvec,0,NSIMD>::setBody(B,i);
+    simdvec yi = SIMD<simdvec,1,NSIMD>::setBody(B,i);
+    simdvec zi = SIMD<simdvec,2,NSIMD>::setBody(B,i);
+    simdvec mi = SIMD<simdvec,3,NSIMD>::setBody(B,i);
+    simdvec R2 = EPS2;
+
+    simdvec xj = Xperiodic[0];
+    xi -= xj;
+    simdvec yj = Xperiodic[1];
+    yi -= yj;
+    simdvec zj = Xperiodic[2];
+    zi -= zj;
+
+    simdvec x2 = B[i+1].X[0];
+    x2 -= xi;
+    simdvec y2 = B[i+1].X[1];
+    y2 -= yi;
+    simdvec z2 = B[i+1].X[2];
+    z2 -= zi;
+    simdvec mj = B[i+1].SRC;
+
+    xj = x2;
+    R2 += x2 * x2;
+    yj = y2;
+    R2 += y2 * y2;
+    zj = z2;
+    R2 += z2 * z2;
+    simdvec invR;
+
+    x2 = B[i+2].X[0];
+    y2 = B[i+2].X[1];
+    z2 = B[i+2].X[2];
+    for (int j=i+1; j<n-2; j++) {
+      invR = rsqrt(R2);
+      invR &= index < j;
+      invR &= R2 > zero;
+      R2 = EPS2;
+      x2 -= xi;
+      y2 -= yi;
+      z2 -= zi;
+
+      mj *= invR * mi;
+      pot += mj;
+      B[j].TRG[0] += sum(mj);
+      invR = invR * invR * mj;
+      mj = B[j+1].SRC;
+
+      xj *= invR;
+      ax += xj;
+      B[j].TRG[1] -= sum(xj);
+      xj = x2;
+      R2 += x2 * x2;
+      x2 = B[j+2].X[0];
+
+      yj *= invR;
+      ay += yj;
+      B[j].TRG[2] -= sum(yj);
+      yj = y2;
+      R2 += y2 * y2;
+      y2 = B[j+2].X[1];
+
+      zj *= invR;
+      az += zj;
+      B[j].TRG[3] -= sum(zj);
+      zj = z2;
+      R2 += z2 * z2;
+      z2 = B[j+2].X[2];
+    }
+    if ( n-2 > i ) {
+      invR = rsqrt(R2);
+      invR &= index < n-2;
+      invR &= R2 > zero;
+      R2 = EPS2;
+      x2 -= xi;
+      y2 -= yi;
+      z2 -= zi;
+
+      mj *= invR * mi;
+      pot += mj;
+      B[n-2].TRG[0] += sum(mj);
+      invR = invR * invR * mj;
+      mj = B[n-1].SRC;
+
+      xj *= invR;
+      ax += xj;
+      B[n-2].TRG[1] -= sum(xj);
+      xj = x2;
+      R2 += x2 * x2;
+
+      yj *= invR;
+      ay += yj;
+      B[n-2].TRG[2] -= sum(yj);
+      yj = y2;
+      R2 += y2 * y2;
+
+      zj *= invR;
+      az += zj;
+      B[n-2].TRG[3] -= sum(zj);
+      zj = z2;
+      R2 += z2 * z2;
+    }
+    invR = rsqrt(R2);
+    invR &= index < n-1;
+    invR &= R2 > zero;
+    mj *= invR * mi;
+    pot += mj;
+    B[n-1].TRG[0] += sum(mj);
+    invR = invR * invR * mj;
+
+    xj *= invR;
+    ax += xj;
+    B[n-1].TRG[1] -= sum(xj);
+    yj *= invR;
+    ay += yj;
+    B[n-1].TRG[2] -= sum(yj);
+    zj *= invR;
+    az += zj;
+    B[n-1].TRG[3] -= sum(zj);
+    for (int k=0; k<NSIMD; k++) {
+      B[i+k].TRG[0] += transpose(pot,k);
+      B[i+k].TRG[1] += transpose(ax,k);
+      B[i+k].TRG[2] += transpose(ay,k);
+      B[i+k].TRG[3] += transpose(az,k);
+    }
+  }
+#endif
+  for ( ; i<n; i++) {
+    kreal_t pot = 0;
+    kreal_t ax = 0;
+    kreal_t ay = 0;
+    kreal_t az = 0;
+    for (int j=i+1; j<n; j++) {
+      vec3 dX = B[i].X - B[j].X;
+      real_t R2 = norm(dX) + EPS2;
+      if (R2 != 0) {
+        real_t invR2 = 1.0 / R2;
+        real_t invR = B[i].SRC * B[j].SRC * sqrt(invR2);
+        dX *= invR2 * invR;
+        pot += invR;
+        ax += dX[0];
+        ay += dX[1];
+        az += dX[2];
+        B[j].TRG[0] += invR;
+        B[j].TRG[1] += dX[0];
+        B[j].TRG[2] += dX[1];
+        B[j].TRG[3] += dX[2];
+      }
+    }
+    B[i].TRG[0] += pot;
+    B[i].TRG[1] -= ax;
+    B[i].TRG[2] -= ay;
+    B[i].TRG[3] -= az;
+  }
+}

kernels/3DLaplaceCartesianCPU.cxx

+#include "kernel.h"
+const real_t EPS = 1e-12;                                       // Double precision epsilon
+
+template<int nx, int ny, int nz>
+struct Index {
+  static const int                I = Index<nx,ny+1,nz-1>::I + 1;
+  static const unsigned long long F = Index<nx,ny,nz-1>::F * nz;
+};
+
+template<int nx, int ny>
+struct Index<nx,ny,0> {
+  static const int                I = Index<nx+1,0,ny-1>::I + 1;
+  static const unsigned long long F = Index<nx,ny-1,0>::F * ny;
+};
+
+template<int nx>
+struct Index<nx,0,0> {
+  static const int                I = Index<0,0,nx-1>::I + 1;
+  static const unsigned long long F = Index<nx-1,0,0>::F * nx;
+};
+
+template<>
+struct Index<0,0,0> {
+  static const int                I = 0;
+  static const unsigned long long F = 1;
+};
+
+
+template<int n, int kx, int ky , int kz, int d>
+struct DerivativeTerm {
+  static const int coef = 1 - 2 * n;
+  static inline real_t kernel(const vecP &C, const vec3 &dX) {
+    return coef * dX[d] * C[Index<kx,ky,kz>::I];
+  }
+};
+
+template<int n, int kx, int ky , int kz>
+struct DerivativeTerm<n,kx,ky,kz,-1> {
+  static const int coef = 1 - n;
+  static inline real_t kernel(const vecP &C, const vec3&) {
+    return coef * C[Index<kx,ky,kz>::I];
+  }
+};
+
+
+template<int nx, int ny, int nz, int kx=nx, int ky=ny, int kz=nz, int flag=5>
+struct DerivativeSum {
+  static const int nextflag = 5 - (kz < nz || kz == 1);
+  static const int dim = kz == (nz-1) ? -1 : 2;
+  static const int n = nx + ny + nz;
+  static inline real_t loop(const vecP &C, const vec3 &dX) {
+    return DerivativeSum<nx,ny,nz,nx,ny,kz-1,nextflag>::loop(C,dX)
+         + DerivativeTerm<n,nx,ny,kz-1,dim>::kernel(C,dX);
+  }
+};
+
+template<int nx, int ny, int nz, int kx, int ky, int kz>
+struct DerivativeSum<nx,ny,nz,kx,ky,kz,4> {
+  static const int nextflag = 3 - (ny == 0);
+  static inline real_t loop(const vecP &C, const vec3 &dX) {
+    return DerivativeSum<nx,ny,nz,nx,ny,nz,nextflag>::loop(C,dX);
+  }
+};
+
+template<int nx, int ny, int nz, int kx, int ky, int kz>
+struct DerivativeSum<nx,ny,nz,kx,ky,kz,3> {
+  static const int nextflag = 3 - (ky < ny || ky == 1);
+  static const int dim = ky == (ny-1) ? -1 : 1;
+  static const int n = nx + ny + nz;
+  static inline real_t loop(const vecP &C, const vec3 &dX) {
+    return DerivativeSum<nx,ny,nz,nx,ky-1,nz,nextflag>::loop(C,dX)
+         + DerivativeTerm<n,nx,ky-1,nz,dim>::kernel(C,dX);
+  }
+};
+
+template<int nx, int ny, int nz, int kx, int ky, int kz>
+struct DerivativeSum<nx,ny,nz,kx,ky,kz,2> {
+  static const int nextflag = 1 - (nx == 0);
+  static inline real_t loop(const vecP &C, const vec3 &dX) {
+    return DerivativeSum<nx,ny,nz,nx,ny,nz,nextflag>::loop(C,dX);
+  }
+};
+
+template<int nx, int ny, int nz, int kx, int ky, int kz>
+struct DerivativeSum<nx,ny,nz,kx,ky,kz,1> {
+  static const int nextflag = 1 - (kx < nx || kx == 1);
+  static const int dim = kx == (nx-1) ? -1 : 0;
+  static const int n = nx + ny + nz;
+  static inline real_t loop(const vecP &C, const vec3 &dX) {
+    return DerivativeSum<nx,ny,nz,kx-1,ny,nz,nextflag>::loop(C,dX)
+         + DerivativeTerm<n,kx-1,ny,nz,dim>::kernel(C,dX);
+  }
+};
+
+template<int nx, int ny, int nz, int kx, int ky, int kz>
+struct DerivativeSum<nx,ny,nz,kx,ky,kz,0> {
+  static inline real_t loop(const vecP&, const vec3&) {
+    return 0;
+  }
+};
+
+template<int nx, int ny, int nz, int kx, int ky>
+struct DerivativeSum<nx,ny,nz,kx,ky,0,5> {
+  static inline real_t loop(const vecP &C, const vec3 &dX) {
+    return DerivativeSum<nx,ny,nz,nx,ny,0,4>::loop(C,dX);
+  }
+};
+
+
+template<int nx, int ny, int nz, int kx=nx, int ky=ny, int kz=nz>
+struct MultipoleSum {
+  static inline real_t kernel(const vecP &C, const vecP &M) {
+    return MultipoleSum<nx,ny,nz,kx,ky,kz-1>::kernel(C,M)
+         + C[Index<nx-kx,ny-ky,nz-kz>::I]*M[Index<kx,ky,kz>::I];
+  }
+};
+
+template<int nx, int ny, int nz, int kx, int ky>
+struct MultipoleSum<nx,ny,nz,kx,ky,0> {
+  static inline real_t kernel(const vecP &C, const vecP &M) {
+    return MultipoleSum<nx,ny,nz,kx,ky-1,nz>::kernel(C,M)
+         + C[Index<nx-kx,ny-ky,nz>::I]*M[Index<kx,ky,0>::I];
+  }
+};
+
+template<int nx, int ny, int nz, int kx>
+struct MultipoleSum<nx,ny,nz,kx,0,0> {
+  static inline real_t kernel(const vecP &C, const vecP &M) {
+    return MultipoleSum<nx,ny,nz,kx-1,ny,nz>::kernel(C,M)
+         + C[Index<nx-kx,ny,nz>::I]*M[Index<kx,0,0>::I];
+  }
+};
+
+template<int nx, int ny, int nz>
+struct MultipoleSum<nx,ny,nz,0,0,0> {
+  static inline real_t kernel(const vecP&, const vecP&) { return 0; }
+};
+
+
+template<int nx, int ny, int nz, int kx=0, int ky=0, int kz=P-1-nx-ny-nz>
+struct LocalSum {
+  static inline real_t kernel(const vecP &M, const vecP &L) {
+    return LocalSum<nx,ny,nz,kx,ky+1,kz-1>::kernel(M,L)
+         + M[Index<kx,ky,kz>::I] * L[Index<nx+kx,ny+ky,nz+kz>::I];
+  }
+};
+
+template<int nx, int ny, int nz, int kx, int ky>
+struct LocalSum<nx,ny,nz,kx,ky,0> {
+  static inline real_t kernel(const vecP &M, const vecP &L) {
+    return LocalSum<nx,ny,nz,kx+1,0,ky-1>::kernel(M,L)
+         + M[Index<kx,ky,0>::I] * L[Index<nx+kx,ny+ky,nz>::I];
+  }
+};
+
+template<int nx, int ny, int nz, int kx>
+struct LocalSum<nx,ny,nz,kx,0,0> {
+  static inline real_t kernel(const vecP &M, const vecP &L) {
+    return LocalSum<nx,ny,nz,0,0,kx-1>::kernel(M,L)
+         + M[Index<kx,0,0>::I] * L[Index<nx+kx,ny,nz>::I];
+  }
+};
+
+template<int nx, int ny, int nz>
+struct LocalSum<nx,ny,nz,0,0,0> {
+  static inline real_t kernel(const vecP&, const vecP&) { return 0; }
+};
+
+
+template<int nx, int ny, int nz>
+struct Kernels {
+  static inline void power(vecP &C, const vec3 &dX) {
+    Kernels<nx,ny+1,nz-1>::power(C,dX);
+    C[Index<nx,ny,nz>::I] = C[Index<nx,ny,nz-1>::I] * dX[2] / nz;
+  }
+  static inline void derivative(vecP &C, const vec3 &dX, const real_t &invR2) {
+    static const int n = nx + ny + nz;
+    Kernels<nx,ny+1,nz-1>::derivative(C,dX,invR2);
+    C[Index<nx,ny,nz>::I] = DerivativeSum<nx,ny,nz>::loop(C,dX) / n * invR2;
+  }
+  static inline void scale(vecP &C) {
+    Kernels<nx,ny+1,nz-1>::scale(C);
+    C[Index<nx,ny,nz>::I] *= Index<nx,ny,nz>::F;
+  }
+  static inline void M2M(vecP &MI, const vecP &C, const vecP &MJ) {
+    Kernels<nx,ny+1,nz-1>::M2M(MI,C,MJ);
+    MI[Index<nx,ny,nz>::I] += MultipoleSum<nx,ny,nz>::kernel(C,MJ);
+  }
+  static inline void M2L(vecP &L, const vecP &C, const vecP &M) {
+    Kernels<nx,ny+1,nz-1>::M2L(L,C,M);
+    L[Index<nx,ny,nz>::I] += LocalSum<nx,ny,nz>::kernel(M,C);
+  }
+  static inline void L2L(vecP &LI, const vecP &C, const vecP &LJ) {
+    Kernels<nx,ny+1,nz-1>::L2L(LI,C,LJ);
+    LI[Index<nx,ny,nz>::I] += LocalSum<nx,ny,nz>::kernel(C,LJ);
+  }
+  static inline void L2P(B_iter B, const vecP &C, const vecP &L) {
+    Kernels<nx,ny+1,nz-1>::L2P(B,C,L);
+    B->TRG[Index<nx,ny,nz>::I] += LocalSum<nx,ny,nz>::kernel(C,L);
+  }
+};
+
+template<int nx, int ny>
+struct Kernels<nx,ny,0> {
+  static inline void power(vecP &C, const vec3 &dX) {
+    Kernels<nx+1,0,ny-1>::power(C,dX);
+    C[Index<nx,ny,0>::I] = C[Index<nx,ny-1,0>::I] * dX[1] / ny;
+  }
+  static inline void derivative(vecP &C, const vec3 &dX, const real_t &invR2) {
+    static const int n = nx + ny;
+    Kernels<nx+1,0,ny-1>::derivative(C,dX,invR2);
+    C[Index<nx,ny,0>::I] = DerivativeSum<nx,ny,0>::loop(C,dX) / n * invR2;
+  }
+  static inline void scale(vecP &C) {
+    Kernels<nx+1,0,ny-1>::scale(C);
+    C[Index<nx,ny,0>::I] *= Index<nx,ny,0>::F;
+  }
+  static inline void M2M(vecP &MI, const vecP &C, const vecP &MJ) {
+    Kernels<nx+1,0,ny-1>::M2M(MI,C,MJ);
+    MI[Index<nx,ny,0>::I] += MultipoleSum<nx,ny,0>::kernel(C,MJ);
+  }
+  static inline void M2L(vecP &L, const vecP &C, const vecP &M) {
+    Kernels<nx+1,0,ny-1>::M2L(L,C,M);
+    L[Index<nx,ny,0>::I] += LocalSum<nx,ny,0>::kernel(M,C);
+  }
+  static inline void L2L(vecP &LI, const vecP &C, const vecP &LJ) {
+    Kernels<nx+1,0,ny-1>::L2L(LI,C,LJ);
+    LI[Index<nx,ny,0>::I] += LocalSum<nx,ny,0>::kernel(C,LJ);
+  }
+  static inline void L2P(B_iter B, const vecP &C, const vecP &L) {
+    Kernels<nx+1,0,ny-1>::L2P(B,C,L);
+    B->TRG[Index<nx,ny,0>::I] += LocalSum<nx,ny,0>::kernel(C,L);
+  }
+};
+
+template<int nx>
+struct Kernels<nx,0,0> {
+  static inline void power(vecP &C, const vec3 &dX) {
+    Kernels<0,0,nx-1>::power(C,dX);
+    C[Index<nx,0,0>::I] = C[Index<nx-1,0,0>::I] * dX[0] / nx;
+  }
+  static inline void derivative(vecP &C, const vec3 &dX, const real_t &invR2) {
+    static const int n = nx;
+    Kernels<0,0,nx-1>::derivative(C,dX,invR2);
+    C[Index<nx,0,0>::I] = DerivativeSum<nx,0,0>::loop(C,dX) / n * invR2;
+  }
+  static inline void scale(vecP &C) {
+    Kernels<0,0,nx-1>::scale(C);
+    C[Index<nx,0,0>::I] *= Index<nx,0,0>::F;
+  }
+  static inline void M2M(vecP &MI, const vecP &C, const vecP &MJ) {
+    Kernels<0,0,nx-1>::M2M(MI,C,MJ);
+    MI[Index<nx,0,0>::I] += MultipoleSum<nx,0,0>::kernel(C,MJ);
+  }
+  static inline void M2L(vecP &L, const vecP &C, const vecP &M) {
+    Kernels<0,0,nx-1>::M2L(L,C,M);
+    L[Index<nx,0,0>::I] += LocalSum<nx,0,0>::kernel(M,C);
+  }
+  static inline void L2L(vecP &LI, const vecP &C, const vecP &LJ) {
+    Kernels<0,0,nx-1>::L2L(LI,C,LJ);
+    LI[Index<nx,0,0>::I] += LocalSum<nx,0,0>::kernel(C,LJ);
+  }
+  static inline void L2P(B_iter B, const vecP &C, const vecP &L) {
+    Kernels<0,0,nx-1>::L2P(B,C,L);
+    B->TRG[Index<nx,0,0>::I] += LocalSum<nx,0,0>::kernel(C,L);
+  }
+};
+
+template<>
+struct Kernels<0,0,0> {
+  static inline void power(vecP&, const vec3&) {}
+  static inline void derivative(vecP&, const vec3&, const real_t&) {}
+  static inline void scale(vecP&) {}
+  static inline void M2M(vecP&, const vecP&, const vecP&) {}
+  static inline void M2L(vecP&, const vecP&, const vecP&) {}
+  static inline void L2L(vecP&, const vecP&, const vecP&) {}
+  static inline void L2P(B_iter, const vecP&, const vecP&) {}
+};
+
+
+template<int PP>
+inline void getCoef(vecP &C, const vec3 &dX, real_t &invR2, const real_t &invR) {
+  C[0] = invR;
+  Kernels<0,0,PP>::derivative(C,dX,invR2);
+  Kernels<0,0,PP>::scale(C);
+}
+
+template<>
+inline void getCoef<1>(vecP &C, const vec3 &dX, real_t &invR2, const real_t &invR) {
+  C[0] = invR;
+  invR2 = -invR2;
+  real_t x = dX[0], y = dX[1], z = dX[2];
+  real_t invR3 = invR * invR2;
+  C[1] = x * invR3;
+  C[2] = y * invR3;
+  C[3] = z * invR3;
+}
+
+template<>
+inline void getCoef<2>(vecP &C, const vec3 &dX, real_t &invR2, const real_t &invR) {
+  getCoef<1>(C,dX,invR2,invR);
+  real_t x = dX[0], y = dX[1], z = dX[2];
+  real_t invR3 = invR * invR2;
+  real_t invR5 = 3 * invR3 * invR2;
+  real_t t = x * invR5;
+  C[4] = x * t + invR3;
+  C[5] = y * t;
+  C[6] = z * t;
+  t = y * invR5;
+  C[7] = y * t + invR3;
+  C[8] = z * t;
+  C[9] = z * z * invR5 + invR3;
+}
+template<>
+inline void getCoef<3>(vecP &C, const vec3 &dX, real_t &invR2, const real_t &invR) {
+  getCoef<2>(C,dX,invR2,invR);
+  real_t x = dX[0], y = dX[1], z = dX[2];
+  real_t invR3 = invR * invR2;
+  real_t invR5 = 3 * invR3 * invR2;
+  real_t invR7 = 5 * invR5 * invR2;
+  real_t t = x * x * invR7;
+  C[10] = x * (t + 3 * invR5);
+  C[11] = y * (t +     invR5);
+  C[12] = z * (t +     invR5);
+  t = y * y * invR7;
+  C[13] = x * (t +     invR5);
+  C[16] = y * (t + 3 * invR5);
+  C[17] = z * (t +     invR5);
+  t = z * z * invR7;
+  C[15] = x * (t +     invR5);
+  C[18] = y * (t +     invR5);
+  C[19] = z * (t + 3 * invR5);
+  C[14] = x * y * z * invR7;
+}
+
+template<>
+inline void getCoef<4>(vecP &C, const vec3 &dX, real_t &invR2, const real_t &invR) {
+  getCoef<3>(C,dX,invR2,invR);
+  real_t x = dX[0], y = dX[1], z = dX[2];
+  real_t invR3 = invR * invR2;
+  real_t invR5 = 3 * invR3 * invR2;
+  real_t invR7 = 5 * invR5 * invR2;
+  real_t invR9 = 7 * invR7 * invR2;
+  real_t t = x * x * invR9;
+  C[20] = x * x * (t + 6 * invR7) + 3 * invR5;
+  C[21] = x * y * (t + 3 * invR7);
+  C[22] = x * z * (t + 3 * invR7);
+  C[23] = y * y * (t +     invR7) + x * x * invR7 + invR5;
+  C[24] = y * z * (t +     invR7);
+  C[25] = z * z * (t +     invR7) + x * x * invR7 + invR5;
+  t = y * y * invR9;
+  C[26] = x * y * (t + 3 * invR7);
+  C[27] = x * z * (t +     invR7);
+  C[30] = y * y * (t + 6 * invR7) + 3 * invR5;
+  C[31] = y * z * (t + 3 * invR7);
+  C[32] = z * z * (t +     invR7) + y * y * invR7 + invR5;
+  t = z * z * invR9;
+  C[28] = x * y * (t +     invR7);
+  C[29] = x * z * (t + 3 * invR7);
+  C[33] = y * z * (t + 3 * invR7);
+  C[34] = z * z * (t + 6 * invR7) + 3 * invR5;
+}
+
+template<>
+inline void getCoef<5>(vecP &C, const vec3 &dX, real_t &invR2, const real_t &invR) {
+  getCoef<4>(C,dX,invR2,invR);
+  real_t x = dX[0], y = dX[1], z = dX[2];
+  real_t invR3 = invR * invR2;
+  real_t invR5 = 3 * invR3 * invR2;
+  real_t invR7 = 5 * invR5 * invR2;
+  real_t invR9 = 7 * invR7 * invR2;
+  real_t invR11 = 9 * invR9 * invR2;
+  real_t t = x * x * invR11;
+  C[35] = x * x * x * (t + 10 * invR9) + 15 * x * invR7;
+  C[36] = x * x * y * (t +  6 * invR9) +  3 * y * invR7;
+  C[37] = x * x * z * (t +  6 * invR9) +  3 * z * invR7;
+  C[38] = x * y * y * (t +  3 * invR9) + x * x * x * invR9 + 3 * x * invR7;
+  C[39] = x * y * z * (t +  3 * invR9);
+  C[40] = x * z * z * (t +  3 * invR9) + x * x * x * invR9 + 3 * x * invR7;
+  C[41] = y * y * y * (t +      invR9) + 3 * x * x * y * invR9 + 3 * y * invR7;
+  C[42] = y * y * z * (t +      invR9) + x * x * z * invR9 + z * invR7;
+  C[43] = y * z * z * (t +      invR9) + x * x * y * invR9 + y * invR7;
+  C[44] = z * z * z * (t +      invR9) + 3 * x * x * z * invR9 + 3 * z * invR7;
+  t = y * y * invR11;
+  C[45] = x * y * y * (t +  6 * invR9) +  3 * x * invR7;
+  C[46] = x * y * z * (t +  3 * invR9);
+  C[47] = x * z * z * (t +      invR9) + x * y * y * invR9 + x * invR7;
+  C[50] = y * y * y * (t + 10 * invR9) + 15 * y * invR7;
+  C[51] = y * y * z * (t +  6 * invR9) + 3 * z * invR7;
+  C[52] = y * z * z * (t +  3 * invR9) + y * y * y * invR9 + 3 * y * invR7;
+  C[53] = z * z * z * (t +      invR9) + 3 * y * y * z * invR9 + 3 * z * invR7;
+  t = z * z * invR11;
+  C[48] = x * y * z * (t +  3 * invR9);
+  C[49] = x * z * z * (t +  6 * invR9) +  3 * x * invR7;
+  C[54] = y * z * z * (t +  6 * invR9) +  3 * y * invR7;
+  C[55] = z * z * z * (t + 10 * invR9) + 15 * z * invR7;
+}
+
+template<>
+inline void getCoef<6>(vecP &C, const vec3 &dX, real_t &invR2, const real_t &invR) {
+  getCoef<5>(C,dX,invR2,invR);
+  real_t x = dX[0], y = dX[1], z = dX[2];
+  real_t invR3 = invR * invR2;
+  real_t invR5 = 3 * invR3 * invR2;
+  real_t invR7 = 5 * invR5 * invR2;
+  real_t invR9 = 7 * invR7 * invR2;
+  real_t invR11 = 9 * invR9 * invR2;
+  real_t invR13 = 11 * invR11 * invR2;
+  real_t t = x * x * invR13;
+  C[56] = x * x * x * x * (t + 15 * invR11) + 45 * x * x * invR9 + 15 * invR7;
+  C[57] = x * x * x * y * (t + 10 * invR11) + 15 * x * y * invR9;
+  C[58] = x * x * x * z * (t + 10 * invR11) + 15 * x * z * invR9;
+  C[59] = x * x * y * y * (t +  6 * invR11) + x * x * x * x * invR11 + (6 * x * x + 3 * y * y) * invR9 + 3 * invR7;
+  C[60] = x * x * y * z * (t +  6 * invR11) + 3 * y * z * invR9;
+  C[61] = x * x * z * z * (t +  6 * invR11) + x * x * x * x * invR11 + (6 * x * x + 3 * z * z) * invR9 + 3 * invR7;
+  C[62] = x * y * y * y * (t +  3 * invR11) + 3 * x * x * x * y * invR11 + 9 * x * y * invR9;
+  C[63] = x * y * y * z * (t +  3 * invR11) + x * x * x * z * invR11 + 3 * x * z * invR9;
+  C[64] = x * y * z * z * (t +  3 * invR11) + x * x * x * y * invR11 + 3 * x * y * invR9;
+  C[65] = x * z * z * z * (t +  3 * invR11) + 3 * x * x * x * z * invR11 + 9 * x * z * invR9;
+  C[66] = y * y * y * y * (t +      invR11) + 6 * x * x * y * y * invR11 + (3 * x * x + 6 * y * y) * invR9 + 3 * invR7;
+  C[67] = y * y * y * z * (t +      invR11) + 3 * x * x * y * z * invR11 + 3 * y * z * invR9;
+  C[68] = y * y * z * z * (t +      invR11) + (x * x * y * y + x * x * z * z) * invR11 + (x * x + y * y + z * z) * invR9 + invR7;
+  C[69] = y * z * z * z * (t +      invR11) + 3 * x * x * y * z * invR11 + 3 * y * z * invR9;
+  C[70] = z * z * z * z * (t +      invR11) + 6 * x * x * z * z * invR11 + (3 * x * x + 6 * z * z) * invR9 + 3 * invR7;
+  t = y * y * invR13;
+  C[71] = x * y * y * y * (t + 10 * invR11) + 15 * x * y * invR9;
+  C[72] = x * y * y * z * (t +  6 * invR11) + 3 * x * z * invR9;
+  C[73] = x * y * z * z * (t +  3 * invR11) + x * y * y * y * invR11 + 3 * x * y * invR9;
+  C[74] = x * z * z * z * (t +      invR11) + 3 * x * y * y * z * invR11 + 3 * x * z * invR9;
+  C[77] = y * y * y * y * (t + 15 * invR11) + 45 * y * y * invR9 + 15 * invR7;
+  C[78] = y * y * y * z * (t + 10 * invR11) + 15 * y * z * invR9;
+  C[79] = y * y * z * z * (t +  6 * invR11) + y * y * y * y * invR11 + (6 * y * y + 3 * z * z) * invR9 + 3 * invR7;
+  C[80] = y * z * z * z * (t +  3 * invR11) + 3 * y * y * y * z * invR11 + 9 * y * z * invR9;
+  C[81] = z * z * z * z * (t +      invR11) + 6 * y * y * z * z * invR11 + (3 * y * y + 6 * z * z) * invR9 + 3 * invR7;
+  t = z * z * invR13;
+  C[75] = x * y * z * z * (t +  6 * invR11) + 3 * x * y * invR9;
+  C[76] = x * z * z * z * (t + 10 * invR11) + 15 * x * z * invR9;
+  C[82] = y * z * z * z * (t + 10 * invR11) + 15 * y * z * invR9;
+  C[83] = z * z * z * z * (t + 15 * invR11) + 45 * z * z * invR9 + 15 * invR7;
+}
+
+
+template<int PP>
+inline void sumM2L(vecP &L, const vecP &C, const vecP &M) {
+  for (int i=0; i<NTERM; i++) L[i] += C[i];
+  for (int i=1; i<NTERM; i++) L[0] += M[i] * C[i];
+  Kernels<0,0,PP-1>::M2L(L,C,M);
+}
+
+template<>
+inline void sumM2L<1>(vecP &L, const vecP &C, const vecP&) {
+  for (int i=0; i<NTERM; i++) L[i] += C[i];
+}
+
+template<>
+inline void sumM2L<2>(vecP &L, const vecP &C, const vecP &M) {
+  sumM2L<1>(L,C,M);
+  L[0] += M[1]*C[1]+M[2]*C[2]+M[3]*C[3];
+  L[1] += M[1]*C[4]+M[2]*C[5]+M[3]*C[6];
+  L[2] += M[1]*C[5]+M[2]*C[7]+M[3]*C[8];
+  L[3] += M[1]*C[6]+M[2]*C[8]+M[3]*C[9];
+}
+
+template<>
+inline void sumM2L<3>(vecP &L, const vecP &C, const vecP &M) {
+  sumM2L<2>(L,C,M);
+  L[0] += M[4]*C[4]+M[5]*C[5]+M[6]*C[6]+M[7]*C[7]+M[8]*C[8]+M[9]*C[9];
+  L[1] += M[4]*C[10]+M[5]*C[11]+M[6]*C[12]+M[7]*C[13]+M[8]*C[14]+M[9]*C[15];
+  L[2] += M[4]*C[11]+M[5]*C[13]+M[6]*C[14]+M[7]*C[16]+M[8]*C[17]+M[9]*C[18];
+  L[3] += M[4]*C[12]+M[5]*C[14]+M[6]*C[15]+M[7]*C[17]+M[8]*C[18]+M[9]*C[19];
+  L[4] += M[1]*C[10]+M[2]*C[11]+M[3]*C[12];
+  L[5] += M[1]*C[11]+M[2]*C[13]+M[3]*C[14];
+  L[6] += M[1]*C[12]+M[2]*C[14]+M[3]*C[15];
+  L[7] += M[1]*C[13]+M[2]*C[16]+M[3]*C[17];
+  L[8] += M[1]*C[14]+M[2]*C[17]+M[3]*C[18];
+  L[9] += M[1]*C[15]+M[2]*C[18]+M[3]*C[19];
+}
+
+template<>
+inline void sumM2L<4>(vecP &L, const vecP &C, const vecP &M) {
+  sumM2L<3>(L,C,M);
+  L[0] += M[10]*C[10]+M[11]*C[11]+M[12]*C[12]+M[13]*C[13]+M[14]*C[14]+M[15]*C[15]+M[16]*C[16]+M[17]*C[17]+M[18]*C[18]+M[19]*C[19];
+  L[1] += M[10]*C[20]+M[11]*C[21]+M[12]*C[22]+M[13]*C[23]+M[14]*C[24]+M[15]*C[25]+M[16]*C[26]+M[17]*C[27]+M[18]*C[28]+M[19]*C[29];
+  L[2] += M[10]*C[21]+M[11]*C[23]+M[12]*C[24]+M[13]*C[26]+M[14]*C[27]+M[15]*C[28]+M[16]*C[30]+M[17]*C[31]+M[18]*C[32]+M[19]*C[33];
+  L[3] += M[10]*C[22]+M[11]*C[24]+M[12]*C[25]+M[13]*C[27]+M[14]*C[28]+M[15]*C[29]+M[16]*C[31]+M[17]*C[32]+M[18]*C[33]+M[19]*C[34];
+  L[4] += M[4]*C[20]+M[5]*C[21]+M[6]*C[22]+M[7]*C[23]+M[8]*C[24]+M[9]*C[25];
+  L[5] += M[4]*C[21]+M[5]*C[23]+M[6]*C[24]+M[7]*C[26]+M[8]*C[27]+M[9]*C[28];
+  L[6] += M[4]*C[22]+M[5]*C[24]+M[6]*C[25]+M[7]*C[27]+M[8]*C[28]+M[9]*C[29];
+  L[7] += M[4]*C[23]+M[5]*C[26]+M[6]*C[27]+M[7]*C[30]+M[8]*C[31]+M[9]*C[32];
+  L[8] += M[4]*C[24]+M[5]*C[27]+M[6]*C[28]+M[7]*C[31]+M[8]*C[32]+M[9]*C[33];
+  L[9] += M[4]*C[25]+M[5]*C[28]+M[6]*C[29]+M[7]*C[32]+M[8]*C[33]+M[9]*C[34];
+  L[10] += M[1]*C[20]+M[2]*C[21]+M[3]*C[22];
+  L[11] += M[1]*C[21]+M[2]*C[23]+M[3]*C[24];
+  L[12] += M[1]*C[22]+M[2]*C[24]+M[3]*C[25];
+  L[13] += M[1]*C[23]+M[2]*C[26]+M[3]*C[27];
+  L[14] += M[1]*C[24]+M[2]*C[27]+M[3]*C[28];
+  L[15] += M[1]*C[25]+M[2]*C[28]+M[3]*C[29];
+  L[16] += M[1]*C[26]+M[2]*C[30]+M[3]*C[31];
+  L[17] += M[1]*C[27]+M[2]*C[31]+M[3]*C[32];
+  L[18] += M[1]*C[28]+M[2]*C[32]+M[3]*C[33];
+  L[19] += M[1]*C[29]+M[2]*C[33]+M[3]*C[34];
+}
+
+template<>
+inline void sumM2L<5>(vecP &L, const vecP &C, const vecP &M) {
+  sumM2L<4>(L,C,M);
+  L[0] += M[20]*C[20]+M[21]*C[21]+M[22]*C[22]+M[23]*C[23]+M[24]*C[24]+M[25]*C[25]+M[26]*C[26]+M[27]*C[27]+M[28]*C[28]+M[29]*C[29]+M[30]*C[30]+M[31]*C[31]+M[32]*C[32]+M[33]*C[33]+M[34]*C[34];
+  L[1] += M[20]*C[35]+M[21]*C[36]+M[22]*C[37]+M[23]*C[38]+M[24]*C[39]+M[25]*C[40]+M[26]*C[41]+M[27]*C[42]+M[28]*C[43]+M[29]*C[44]+M[30]*C[45]+M[31]*C[46]+M[32]*C[47]+M[33]*C[48]+M[34]*C[49];
+  L[2] += M[20]*C[36]+M[21]*C[38]+M[22]*C[39]+M[23]*C[41]+M[24]*C[42]+M[25]*C[43]+M[26]*C[45]+M[27]*C[46]+M[28]*C[47]+M[29]*C[48]+M[30]*C[50]+M[31]*C[51]+M[32]*C[52]+M[33]*C[53]+M[34]*C[54];
+  L[3] += M[20]*C[37]+M[21]*C[39]+M[22]*C[40]+M[23]*C[42]+M[24]*C[43]+M[25]*C[44]+M[26]*C[46]+M[27]*C[47]+M[28]*C[48]+M[29]*C[49]+M[30]*C[51]+M[31]*C[52]+M[32]*C[53]+M[33]*C[54]+M[34]*C[55];
+  L[4] += M[10]*C[35]+M[11]*C[36]+M[12]*C[37]+M[13]*C[38]+M[14]*C[39]+M[15]*C[40]+M[16]*C[41]+M[17]*C[42]+M[18]*C[43]+M[19]*C[44];
+  L[5] += M[10]*C[36]+M[11]*C[38]+M[12]*C[39]+M[13]*C[41]+M[14]*C[42]+M[15]*C[43]+M[16]*C[45]+M[17]*C[46]+M[18]*C[47]+M[19]*C[48];
+  L[6] += M[10]*C[37]+M[11]*C[39]+M[12]*C[40]+M[13]*C[42]+M[14]*C[43]+M[15]*C[44]+M[16]*C[46]+M[17]*C[47]+M[18]*C[48]+M[19]*C[49];
+  L[7] += M[10]*C[38]+M[11]*C[41]+M[12]*C[42]+M[13]*C[45]+M[14]*C[46]+M[15]*C[47]+M[16]*C[50]+M[17]*C[51]+M[18]*C[52]+M[19]*C[53];
+  L[8] += M[10]*C[39]+M[11]*C[42]+M[12]*C[43]+M[13]*C[46]+M[14]*C[47]+M[15]*C[48]+M[16]*C[51]+M[17]*C[52]+M[18]*C[53]+M[19]*C[54];
+  L[9] += M[10]*C[40]+M[11]*C[43]+M[12]*C[44]+M[13]*C[47]+M[14]*C[48]+M[15]*C[49]+M[16]*C[52]+M[17]*C[53]+M[18]*C[54]+M[19]*C[55];
+  L[10] += M[4]*C[35]+M[5]*C[36]+M[6]*C[37]+M[7]*C[38]+M[8]*C[39]+M[9]*C[40];
+  L[11] += M[4]*C[36]+M[5]*C[38]+M[6]*C[39]+M[7]*C[41]+M[8]*C[42]+M[9]*C[43];
+  L[12] += M[4]*C[37]+M[5]*C[39]+M[6]*C[40]+M[7]*C[42]+M[8]*C[43]+M[9]*C[44];
+  L[13] += M[4]*C[38]+M[5]*C[41]+M[6]*C[42]+M[7]*C[45]+M[8]*C[46]+M[9]*C[47];
+  L[14] += M[4]*C[39]+M[5]*C[42]+M[6]*C[43]+M[7]*C[46]+M[8]*C[47]+M[9]*C[48];
+  L[15] += M[4]*C[40]+M[5]*C[43]+M[6]*C[44]+M[7]*C[47]+M[8]*C[48]+M[9]*C[49];
+  L[16] += M[4]*C[41]+M[5]*C[45]+M[6]*C[46]+M[7]*C[50]+M[8]*C[51]+M[9]*C[52];
+  L[17] += M[4]*C[42]+M[5]*C[46]+M[6]*C[47]+M[7]*C[51]+M[8]*C[52]+M[9]*C[53];
+  L[18] += M[4]*C[43]+M[5]*C[47]+M[6]*C[48]+M[7]*C[52]+M[8]*C[53]+M[9]*C[54];
+  L[19] += M[4]*C[44]+M[5]*C[48]+M[6]*C[49]+M[7]*C[53]+M[8]*C[54]+M[9]*C[55];
+  L[20] += M[1]*C[35]+M[2]*C[36]+M[3]*C[37];
+  L[21] += M[1]*C[36]+M[2]*C[38]+M[3]*C[39];
+  L[22] += M[1]*C[37]+M[2]*C[39]+M[3]*C[40];
+  L[23] += M[1]*C[38]+M[2]*C[41]+M[3]*C[42];
+  L[24] += M[1]*C[39]+M[2]*C[42]+M[3]*C[43];
+  L[25] += M[1]*C[40]+M[2]*C[43]+M[3]*C[44];
+  L[26] += M[1]*C[41]+M[2]*C[45]+M[3]*C[46];
+  L[27] += M[1]*C[42]+M[2]*C[46]+M[3]*C[47];
+  L[28] += M[1]*C[43]+M[2]*C[47]+M[3]*C[48];
+  L[29] += M[1]*C[44]+M[2]*C[48]+M[3]*C[49];
+  L[30] += M[1]*C[45]+M[2]*C[50]+M[3]*C[51];
+  L[31] += M[1]*C[46]+M[2]*C[51]+M[3]*C[52];
+  L[32] += M[1]*C[47]+M[2]*C[52]+M[3]*C[53];
+  L[33] += M[1]*C[48]+M[2]*C[53]+M[3]*C[54];
+  L[34] += M[1]*C[49]+M[2]*C[54]+M[3]*C[55];
+}
+
+template<>
+inline void sumM2L<6>(vecP &L, const vecP &C, const vecP &M) {
+  sumM2L<5>(L,C,M);
+  L[0] += M[35]*C[35]+M[36]*C[36]+M[37]*C[37]+M[38]*C[38]+M[39]*C[39]+M[40]*C[40]+M[41]*C[41]+M[42]*C[42]+M[43]*C[43]+M[44]*C[44]+M[45]*C[45]+M[46]*C[46]+M[47]*C[47]+M[48]*C[48]+M[49]*C[49]+M[50]*C[50]+M[51]*C[51]+M[52]*C[52]+M[53]*C[53]+M[54]*C[54]+M[55]*C[55];
+  L[1] += M[35]*C[56]+M[36]*C[57]+M[37]*C[58]+M[38]*C[59]+M[39]*C[60]+M[40]*C[61]+M[41]*C[62]+M[42]*C[63]+M[43]*C[64]+M[44]*C[65]+M[45]*C[66]+M[46]*C[67]+M[47]*C[68]+M[48]*C[69]+M[49]*C[70]+M[50]*C[71]+M[51]*C[72]+M[52]*C[73]+M[53]*C[74]+M[54]*C[75]+M[55]*C[76];
+  L[2] += M[35]*C[57]+M[36]*C[59]+M[37]*C[60]+M[38]*C[62]+M[39]*C[63]+M[40]*C[64]+M[41]*C[66]+M[42]*C[67]+M[43]*C[68]+M[44]*C[69]+M[45]*C[71]+M[46]*C[72]+M[47]*C[73]+M[48]*C[74]+M[49]*C[75]+M[50]*C[77]+M[51]*C[78]+M[52]*C[79]+M[53]*C[80]+M[54]*C[81]+M[55]*C[82];
+  L[3] += M[35]*C[58]+M[36]*C[60]+M[37]*C[61]+M[38]*C[63]+M[39]*C[64]+M[40]*C[65]+M[41]*C[67]+M[42]*C[68]+M[43]*C[69]+M[44]*C[70]+M[45]*C[72]+M[46]*C[73]+M[47]*C[74]+M[48]*C[75]+M[49]*C[76]+M[50]*C[78]+M[51]*C[79]+M[52]*C[80]+M[53]*C[81]+M[54]*C[82]+M[55]*C[83];
+  L[4] += M[20]*C[56]+M[21]*C[57]+M[22]*C[58]+M[23]*C[59]+M[24]*C[60]+M[25]*C[61]+M[26]*C[62]+M[27]*C[63]+M[28]*C[64]+M[29]*C[65]+M[30]*C[66]+M[31]*C[67]+M[32]*C[68]+M[33]*C[69]+M[34]*C[70];
+  L[5] += M[20]*C[57]+M[21]*C[59]+M[22]*C[60]+M[23]*C[62]+M[24]*C[63]+M[25]*C[64]+M[26]*C[66]+M[27]*C[67]+M[28]*C[68]+M[29]*C[69]+M[30]*C[71]+M[31]*C[72]+M[32]*C[73]+M[33]*C[74]+M[34]*C[75];
+  L[6] += M[20]*C[58]+M[21]*C[60]+M[22]*C[61]+M[23]*C[63]+M[24]*C[64]+M[25]*C[65]+M[26]*C[67]+M[27]*C[68]+M[28]*C[69]+M[29]*C[70]+M[30]*C[72]+M[31]*C[73]+M[32]*C[74]+M[33]*C[75]+M[34]*C[76];
+  L[7] += M[20]*C[59]+M[21]*C[62]+M[22]*C[63]+M[23]*C[66]+M[24]*C[67]+M[25]*C[68]+M[26]*C[71]+M[27]*C[72]+M[28]*C[73]+M[29]*C[74]+M[30]*C[77]+M[31]*C[78]+M[32]*C[79]+M[33]*C[80]+M[34]*C[81];
+  L[8] += M[20]*C[60]+M[21]*C[63]+M[22]*C[64]+M[23]*C[67]+M[24]*C[68]+M[25]*C[69]+M[26]*C[72]+M[27]*C[73]+M[28]*C[74]+M[29]*C[75]+M[30]*C[78]+M[31]*C[79]+M[32]*C[80]+M[33]*C[81]+M[34]*C[82];
+  L[9] += M[20]*C[61]+M[21]*C[64]+M[22]*C[65]+M[23]*C[68]+M[24]*C[69]+M[25]*C[70]+M[26]*C[73]+M[27]*C[74]+M[28]*C[75]+M[29]*C[76]+M[30]*C[79]+M[31]*C[80]+M[32]*C[81]+M[33]*C[82]+M[34]*C[83];
+  L[10] += M[10]*C[56]+M[11]*C[57]+M[12]*C[58]+M[13]*C[59]+M[14]*C[60]+M[15]*C[61]+M[16]*C[62]+M[17]*C[63]+M[18]*C[64]+M[19]*C[65];
+  L[11] += M[10]*C[57]+M[11]*C[59]+M[12]*C[60]+M[13]*C[62]+M[14]*C[63]+M[15]*C[64]+M[16]*C[66]+M[17]*C[67]+M[18]*C[68]+M[19]*C[69];
+  L[12] += M[10]*C[58]+M[11]*C[60]+M[12]*C[61]+M[13]*C[63]+M[14]*C[64]+M[15]*C[65]+M[16]*C[67]+M[17]*C[68]+M[18]*C[69]+M[19]*C[70];
+  L[13] += M[10]*C[59]+M[11]*C[62]+M[12]*C[63]+M[13]*C[66]+M[14]*C[67]+M[15]*C[68]+M[16]*C[71]+M[17]*C[72]+M[18]*C[73]+M[19]*C[74];
+  L[14] += M[10]*C[60]+M[11]*C[63]+M[12]*C[64]+M[13]*C[67]+M[14]*C[68]+M[15]*C[69]+M[16]*C[72]+M[17]*C[73]+M[18]*C[74]+M[19]*C[75];
+  L[15] += M[10]*C[61]+M[11]*C[64]+M[12]*C[65]+M[13]*C[68]+M[14]*C[69]+M[15]*C[70]+M[16]*C[73]+M[17]*C[74]+M[18]*C[75]+M[19]*C[76];
+  L[16] += M[10]*C[62]+M[11]*C[66]+M[12]*C[67]+M[13]*C[71]+M[14]*C[72]+M[15]*C[73]+M[16]*C[77]+M[17]*C[78]+M[18]*C[79]+M[19]*C[80];
+  L[17] += M[10]*C[63]+M[11]*C[67]+M[12]*C[68]+M[13]*C[72]+M[14]*C[73]+M[15]*C[74]+M[16]*C[78]+M[17]*C[79]+M[18]*C[80]+M[19]*C[81];
+  L[18] += M[10]*C[64]+M[11]*C[68]+M[12]*C[69]+M[13]*C[73]+M[14]*C[74]+M[15]*C[75]+M[16]*C[79]+M[17]*C[80]+M[18]*C[81]+M[19]*C[82];
+  L[19] += M[10]*C[65]+M[11]*C[69]+M[12]*C[70]+M[13]*C[74]+M[14]*C[75]+M[15]*C[76]+M[16]*C[80]+M[17]*C[81]+M[18]*C[82]+M[19]*C[83];
+  L[20] += M[4]*C[56]+M[5]*C[57]+M[6]*C[58]+M[7]*C[59]+M[8]*C[60]+M[9]*C[61];
+  L[21] += M[4]*C[57]+M[5]*C[59]+M[6]*C[60]+M[7]*C[62]+M[8]*C[63]+M[9]*C[64];
+  L[22] += M[4]*C[58]+M[5]*C[60]+M[6]*C[61]+M[7]*C[63]+M[8]*C[64]+M[9]*C[65];
+  L[23] += M[4]*C[59]+M[5]*C[62]+M[6]*C[63]+M[7]*C[66]+M[8]*C[67]+M[9]*C[68];
+  L[24] += M[4]*C[60]+M[5]*C[63]+M[6]*C[64]+M[7]*C[67]+M[8]*C[68]+M[9]*C[69];
+  L[25] += M[4]*C[61]+M[5]*C[64]+M[6]*C[65]+M[7]*C[68]+M[8]*C[69]+M[9]*C[70];
+  L[26] += M[4]*C[62]+M[5]*C[66]+M[6]*C[67]+M[7]*C[71]+M[8]*C[72]+M[9]*C[73];
+  L[27] += M[4]*C[63]+M[5]*C[67]+M[6]*C[68]+M[7]*C[72]+M[8]*C[73]+M[9]*C[74];
+  L[28] += M[4]*C[64]+M[5]*C[68]+M[6]*C[69]+M[7]*C[73]+M[8]*C[74]+M[9]*C[75];
+  L[29] += M[4]*C[65]+M[5]*C[69]+M[6]*C[70]+M[7]*C[74]+M[8]*C[75]+M[9]*C[76];
+  L[30] += M[4]*C[66]+M[5]*C[71]+M[6]*C[72]+M[7]*C[77]+M[8]*C[78]+M[9]*C[79];
+  L[31] += M[4]*C[67]+M[5]*C[72]+M[6]*C[73]+M[7]*C[78]+M[8]*C[79]+M[9]*C[80];
+  L[32] += M[4]*C[68]+M[5]*C[73]+M[6]*C[74]+M[7]*C[79]+M[8]*C[80]+M[9]*C[81];
+  L[33] += M[4]*C[69]+M[5]*C[74]+M[6]*C[75]+M[7]*C[80]+M[8]*C[81]+M[9]*C[82];
+  L[34] += M[4]*C[70]+M[5]*C[75]+M[6]*C[76]+M[7]*C[81]+M[8]*C[82]+M[9]*C[83];
+  L[35] += M[1]*C[56]+M[2]*C[57]+M[3]*C[58];
+  L[36] += M[1]*C[57]+M[2]*C[59]+M[3]*C[60];
+  L[37] += M[1]*C[58]+M[2]*C[60]+M[3]*C[61];
+  L[38] += M[1]*C[59]+M[2]*C[62]+M[3]*C[63];
+  L[39] += M[1]*C[60]+M[2]*C[63]+M[3]*C[64];
+  L[40] += M[1]*C[61]+M[2]*C[64]+M[3]*C[65];
+  L[41] += M[1]*C[62]+M[2]*C[66]+M[3]*C[67];
+  L[42] += M[1]*C[63]+M[2]*C[67]+M[3]*C[68];
+  L[43] += M[1]*C[64]+M[2]*C[68]+M[3]*C[69];
+  L[44] += M[1]*C[65]+M[2]*C[69]+M[3]*C[70];
+  L[45] += M[1]*C[66]+M[2]*C[71]+M[3]*C[72];
+  L[46] += M[1]*C[67]+M[2]*C[72]+M[3]*C[73];
+  L[47] += M[1]*C[68]+M[2]*C[73]+M[3]*C[74];
+  L[48] += M[1]*C[69]+M[2]*C[74]+M[3]*C[75];
+  L[49] += M[1]*C[70]+M[2]*C[75]+M[3]*C[76];
+  L[50] += M[1]*C[71]+M[2]*C[77]+M[3]*C[78];
+  L[51] += M[1]*C[72]+M[2]*C[78]+M[3]*C[79];
+  L[52] += M[1]*C[73]+M[2]*C[79]+M[3]*C[80];
+  L[53] += M[1]*C[74]+M[2]*C[80]+M[3]*C[81];
+  L[54] += M[1]*C[75]+M[2]*C[81]+M[3]*C[82];
+  L[55] += M[1]*C[76]+M[2]*C[82]+M[3]*C[83];
+}
+
+
+template<int PP, bool odd>
+struct Coefs {
+  static const int begin = PP*(PP+1)*(PP+2)/6;
+  static const int end = (PP+1)*(PP+2)*(PP+3)/6;
+  static inline void negate(vecP &C) {
+    for (int i=begin; i<end; i++) C[i] = -C[i];
+    Coefs<PP-1,1-odd>::negate(C);
+  }
+};
+
+template<int PP>
+struct Coefs<PP,0> {
+  static inline void negate(vecP &C) {
+    Coefs<PP-1,1>::negate(C);
+  }
+};
+
+template<>
+struct Coefs<0,0> {
+  static inline void negate(vecP){}
+};
+
+void Kernel::P2M(C_iter C) const {
+  for (B_iter B=C->BODY; B!=C->BODY+C->NCBODY; B++) {
+    vec3 dX = C->X - B->X;
+    real_t R = std::sqrt(norm(dX));
+    if (R > C->RMAX) C->RMAX = R;
+    vecP M;
+    M[0] = B->SRC;
+    Kernels<0,0,P-1>::power(M,dX);
+    for (int i=0; i<NTERM; i++) C->M[i] += M[i];
+  }
+  if (C->NCBODY != 0 && C->M[0] == 0) C->M[0] = EPS;
+#if USE_RMAX
+  C->RCRIT = std::min(C->R,C->RMAX);
+#else
+  C->RCRIT = C->R;
+#endif
+}
+
+void Kernel::M2M(C_iter Ci, C_iter C0) const {
+  for (C_iter Cj=C0+Ci->CHILD; Cj!=C0+Ci->CHILD+Ci->NCHILD; Cj++) {
+    vec3 dX = Ci->X - Cj->X;
+    real_t R = std::sqrt(norm(dX)) + Cj->RCRIT;
+    if (R > Ci->RMAX) Ci->RMAX = R;
+    vecP M;
+    vecP C;
+    C[0] = 1;
+    Kernels<0,0,P-1>::power(C,dX);
+    M = Cj->M;
+    for (int i=0; i<NTERM; i++) Ci->M[i] += C[i] * M[0];
+    Kernels<0,0,P-1>::M2M(Ci->M,C,M);
+  }
+  if (Ci->NCHILD != 0 && Ci->M[0] == 0) Ci->M[0] = EPS;
+#if USE_RMAX
+  Ci->RCRIT = std::min(Ci->R,Ci->RMAX);
+#else
+  Ci->RCRIT = Ci->R;
+#endif
+}
+
+void Kernel::M2L(C_iter Ci, C_iter Cj, bool mutual) const {
+  vec3 dX = Ci->X - Cj->X - Xperiodic;
+  real_t invR2 = 1 / norm(dX);
+  real_t invR  = Ci->M[0] * Cj->M[0] * std::sqrt(invR2);
+  vecP C;
+  getCoef<P-1>(C,dX,invR2,invR);
+  sumM2L<P-1>(Ci->L,C,Cj->M);
+  if (mutual) {
+    Coefs<P-1,(P-1)&1>::negate(C);
+    sumM2L<P-1>(Cj->L,C,Ci->M);
+  }
+}
+
+void Kernel::L2L(C_iter Ci, C_iter Ci0) const {
+  C_iter Cj = Ci0 + Ci->PARENT;
+  vec3 dX = Ci->X - Cj->X;
+  vecP C;
+  C[0] = 1;
+  Kernels<0,0,P-1>::power(C,dX);
+  Ci->L /= Ci->M[0];
+  Ci->L += Cj->L;
+  for (int i=1; i<NTERM; i++) Ci->L[0] += C[i] * Cj->L[i];
+  Kernels<0,0,P-1>::L2L(Ci->L,C,Cj->L);
+}
+
+void Kernel::L2P(C_iter Ci) const {
+  for (B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NCBODY; B++) {
+    vec3 dX = B->X - Ci->X;
+    vecP C, L;
+    C[0] = 1;
+    Kernels<0,0,P-1>::power(C,dX);
+    L = Ci->L;
+    B->TRG /= B->SRC;
+    B->TRG[0] += L[0];
+    B->TRG[1] += L[1];
+    B->TRG[2] += L[2];
+    B->TRG[3] += L[3];
+    for (int i=1; i<NTERM; i++) B->TRG[0] += C[i] * L[i];
+    Kernels<0,0,1>::L2P(B,C,L);
+  }
+}

kernels/3DLaplaceP2PCPU.cxx

+#include "kernel.h"
+#include "simd.h"
+const real_t EPS2 = 0.0;                                        //!< Softening parameter (squared)
+
+void Kernel::P2P(C_iter Ci, C_iter Cj, bool mutual) const {
+  B_iter Bi = Ci->BODY;
+  B_iter Bj = Cj->BODY;
+  int ni = Ci->NDBODY;
+  int nj = Cj->NDBODY;
+  int i = 0;
+#if USE_SIMD
+  for ( ; i<=ni-NSIMD; i+=NSIMD) {
+    simdvec zero = 0;
+    ksimdvec pot = zero;
+    ksimdvec ax = zero;
+    ksimdvec ay = zero;
+    ksimdvec az = zero;
+
+    simdvec xi = SIMD<simdvec,0,NSIMD>::setBody(Bi,i);
+    simdvec yi = SIMD<simdvec,1,NSIMD>::setBody(Bi,i);
+    simdvec zi = SIMD<simdvec,2,NSIMD>::setBody(Bi,i);
+    simdvec mi = SIMD<simdvec,3,NSIMD>::setBody(Bi,i);
+    simdvec R2 = EPS2;
+
+    simdvec xj = Xperiodic[0];
+    xi -= xj;
+    simdvec yj = Xperiodic[1];
+    yi -= yj;
+    simdvec zj = Xperiodic[2];
+    zi -= zj;
+
+    simdvec x2 = Bj[0].X[0];
+    x2 -= xi;
+    simdvec y2 = Bj[0].X[1];
+    y2 -= yi;
+    simdvec z2 = Bj[0].X[2];
+    z2 -= zi;
+    simdvec mj = Bj[0].SRC;
+
+    xj = x2;
+    R2 += x2 * x2;
+    yj = y2;
+    R2 += y2 * y2;
+    zj = z2;
+    R2 += z2 * z2;
+    simdvec invR;
+
+    x2 = Bj[1].X[0];
+    y2 = Bj[1].X[1];
+    z2 = Bj[1].X[2];
+    for (int j=0; j<nj-2; j++) {
+      invR = rsqrt(R2);
+      invR &= R2 > zero;
+      R2 = EPS2;
+      x2 -= xi;
+      y2 -= yi;
+      z2 -= zi;
+
+      mj *= invR * mi;
+      pot += mj;
+      if (mutual) Bj[j].TRG[0] += sum(mj);
+      invR = invR * invR * mj;
+      mj = Bj[j+1].SRC;
+
+      xj *= invR;
+      ax += xj;
+      if (mutual) Bj[j].TRG[1] -= sum(xj);
+      xj = x2;
+      R2 += x2 * x2;
+      x2 = Bj[j+2].X[0];
+
+      yj *= invR;
+      ay += yj;
+      if (mutual) Bj[j].TRG[2] -= sum(yj);
+      yj = y2;
+      R2 += y2 * y2;
+      y2 = Bj[j+2].X[1];
+
+      zj *= invR;
+      az += zj;
+      if (mutual) Bj[j].TRG[3] -= sum(zj);
+      zj = z2;
+      R2 += z2 * z2;
+      z2 = Bj[j+2].X[2];
+    }
+    if ( nj > 1 ) {
+      invR = rsqrt(R2);
+      invR &= R2 > zero;
+      R2 = EPS2;
+      x2 -= xi;
+      y2 -= yi;
+      z2 -= zi;
+
+      mj *= invR * mi;
+      pot += mj;
+      if (mutual) Bj[nj-2].TRG[0] += sum(mj);
+      invR = invR * invR * mj;
+      mj = Bj[nj-1].SRC;
+
+      xj *= invR;
+      ax += xj;
+      if (mutual) Bj[nj-2].TRG[1] -= sum(xj);
+      xj = x2;
+      R2 += x2 * x2;
+
+      yj *= invR;
+      ay += yj;
+      if (mutual) Bj[nj-2].TRG[2] -= sum(yj);
+      yj = y2;
+      R2 += y2 * y2;
+
+      zj *= invR;
+      az += zj;
+      if (mutual) Bj[nj-2].TRG[3] -= sum(zj);
+      zj = z2;
+      R2 += z2 * z2;
+    }
+    invR = rsqrt(R2);
+    invR &= R2 > zero;
+    mj *= invR * mi;
+    pot += mj;
+    if (mutual) Bj[nj-1].TRG[0] += sum(mj);
+    invR = invR * invR * mj;
+
+    xj *= invR;
+    ax += xj;
+    if (mutual) Bj[nj-1].TRG[1] -= sum(xj);
+    yj *= invR;
+    ay += yj;
+    if (mutual) Bj[nj-1].TRG[2] -= sum(yj);
+    zj *= invR;
+    az += zj;
+    if (mutual) Bj[nj-1].TRG[3] -= sum(zj);
+    for (int k=0; k<NSIMD; k++) {
+      Bi[i+k].TRG[0] += transpose(pot,k);
+      Bi[i+k].TRG[1] += transpose(ax,k);
+      Bi[i+k].TRG[2] += transpose(ay,k);
+      Bi[i+k].TRG[3] += transpose(az,k);
+    }
+  }
+#endif
+  for ( ; i<ni; i++) {
+    kreal_t pot = 0; 
+    kreal_t ax = 0;
+    kreal_t ay = 0;
+    kreal_t az = 0;
+    for (int j=0; j<nj; j++) {
+      vec3 dX = Bi[i].X - Bj[j].X - Xperiodic;
+      real_t R2 = norm(dX) + EPS2;
+      if (R2 != 0) {
+        real_t invR2 = 1.0 / R2;
+        real_t invR = Bi[i].SRC * Bj[j].SRC * sqrt(invR2);
+        dX *= invR2 * invR;
+        pot += invR;
+        ax += dX[0];
+        ay += dX[1];
+        az += dX[2];
+        if (mutual) {
+          Bj[j].TRG[0] += invR;
+          Bj[j].TRG[1] += dX[0];
+          Bj[j].TRG[2] += dX[1];
+          Bj[j].TRG[3] += dX[2];
+        }
+      }
+    }
+    Bi[i].TRG[0] += pot;
+    Bi[i].TRG[1] -= ax;
+    Bi[i].TRG[2] -= ay;
+    Bi[i].TRG[3] -= az;
+  }
+}
+
+void Kernel::P2P(C_iter C) const {
+  B_iter B = C->BODY;
+  int n = C->NDBODY;
+  int i = 0;
+#if USE_SIMD
+  for ( ; i<=n-NSIMD; i+=NSIMD) {
+    simdvec zero = 0;
+    ksimdvec pot = zero;
+    ksimdvec ax = zero;
+    ksimdvec ay = zero;
+    ksimdvec az = zero;
+
+    simdvec index = SIMD<simdvec,0,NSIMD>::setIndex(i);
+    simdvec xi = SIMD<simdvec,0,NSIMD>::setBody(B,i);
+    simdvec yi = SIMD<simdvec,1,NSIMD>::setBody(B,i);
+    simdvec zi = SIMD<simdvec,2,NSIMD>::setBody(B,i);
+    simdvec mi = SIMD<simdvec,3,NSIMD>::setBody(B,i);
+    simdvec R2 = EPS2;
+
+    simdvec xj = Xperiodic[0];
+    xi -= xj;
+    simdvec yj = Xperiodic[1];
+    yi -= yj;
+    simdvec zj = Xperiodic[2];
+    zi -= zj;
+
+    simdvec x2 = B[i+1].X[0];
+    x2 -= xi;
+    simdvec y2 = B[i+1].X[1];
+    y2 -= yi;
+    simdvec z2 = B[i+1].X[2];
+    z2 -= zi;
+    simdvec mj = B[i+1].SRC;
+
+    xj = x2;
+    R2 += x2 * x2;
+    yj = y2;
+    R2 += y2 * y2;
+    zj = z2;
+    R2 += z2 * z2;
+    simdvec invR;
+
+    x2 = B[i+2].X[0];
+    y2 = B[i+2].X[1];
+    z2 = B[i+2].X[2];
+    for (int j=i+1; j<n-2; j++) {
+      invR = rsqrt(R2);
+      invR &= index < j;
+      invR &= R2 > zero;
+      R2 = EPS2;
+      x2 -= xi;
+      y2 -= yi;
+      z2 -= zi;
+
+      mj *= invR * mi;
+      pot += mj;
+      B[j].TRG[0] += sum(mj);
+      invR = invR * invR * mj;
+      mj = B[j+1].SRC;
+
+      xj *= invR;
+      ax += xj;
+      B[j].TRG[1] -= sum(xj);
+      xj = x2;
+      R2 += x2 * x2;
+      x2 = B[j+2].X[0];
+
+      yj *= invR;
+      ay += yj;
+      B[j].TRG[2] -= sum(yj);
+      yj = y2;
+      R2 += y2 * y2;
+      y2 = B[j+2].X[1];
+
+      zj *= invR;
+      az += zj;
+      B[j].TRG[3] -= sum(zj);
+      zj = z2;
+      R2 += z2 * z2;
+      z2 = B[j+2].X[2];
+    }
+    if ( n-2 > i ) {
+      invR = rsqrt(R2);
+      invR &= index < n-2;
+      invR &= R2 > zero;
+      R2 = EPS2;
+      x2 -= xi;
+      y2 -= yi;
+      z2 -= zi;
+
+      mj *= invR * mi;
+      pot += mj;
+      B[n-2].TRG[0] += sum(mj);
+      invR = invR * invR * mj;
+      mj = B[n-1].SRC;
+
+      xj *= invR;
+      ax += xj;
+      B[n-2].TRG[1] -= sum(xj);
+      xj = x2;
+      R2 += x2 * x2;
+
+      yj *= invR;
+      ay += yj;
+      B[n-2].TRG[2] -= sum(yj);
+      yj = y2;
+      R2 += y2 * y2;
+
+      zj *= invR;
+      az += zj;
+      B[n-2].TRG[3] -= sum(zj);
+      zj = z2;
+      R2 += z2 * z2;
+    }
+    invR = rsqrt(R2);
+    invR &= index < n-1;
+    invR &= R2 > zero;
+    mj *= invR * mi;
+    pot += mj;
+    B[n-1].TRG[0] += sum(mj);
+    invR = invR * invR * mj;
+
+    xj *= invR;
+    ax += xj;
+    B[n-1].TRG[1] -= sum(xj);
+    yj *= invR;
+    ay += yj;
+    B[n-1].TRG[2] -= sum(yj);
+    zj *= invR;
+    az += zj;
+    B[n-1].TRG[3] -= sum(zj);
+    for (int k=0; k<NSIMD; k++) {
+      B[i+k].TRG[0] += transpose(pot,k);
+      B[i+k].TRG[1] += transpose(ax,k);
+      B[i+k].TRG[2] += transpose(ay,k);
+      B[i+k].TRG[3] += transpose(az,k);
+    }
+  }
+#endif
+  for ( ; i<n; i++) {
+    kreal_t pot = 0;
+    kreal_t ax = 0;
+    kreal_t ay = 0;
+    kreal_t az = 0;
+    for (int j=i+1; j<n; j++) {
+      vec3 dX = B[i].X - B[j].X;
+      real_t R2 = norm(dX) + EPS2;
+      if (R2 != 0) {
+        real_t invR2 = 1.0 / R2;
+        real_t invR = B[i].SRC * B[j].SRC * sqrt(invR2);
+        dX *= invR2 * invR;
+        pot += invR;
+        ax += dX[0];
+        ay += dX[1];
+        az += dX[2];
+        B[j].TRG[0] += invR;
+        B[j].TRG[1] += dX[0];
+        B[j].TRG[2] += dX[1];
+        B[j].TRG[3] += dX[2];
+      }
+    }
+    B[i].TRG[0] += pot;
+    B[i].TRG[1] -= ax;
+    B[i].TRG[2] -= ay;
+    B[i].TRG[3] -= az;
+  }
+}

kernels/3DLaplaceSphericalCPU.cxx

+#include "kernel.h"
+
+#define ODDEVEN(n) ((((n) & 1) == 1) ? -1 : 1)
+#define IPOW2N(n) ((n >= 0) ? 1 : ODDEVEN(n))
+
+const complex_t I(0.,1.);                                       // Imaginary unit
+const real_t EPS = 1e-12;                                       // Double precision epsilon
+
+//! Get r,theta,phi from x,y,z
+void cart2sph(real_t& r, real_t& theta, real_t& phi, vec3 dX) {
+  r = sqrt(norm(dX)) * 1.000001;                                // r = sqrt(x^2 + y^2 + z^2)
+  theta = acos(dX[2] / r);                                      // theta = acos(z / r)
+  phi = atan2(dX[1],dX[0]);                                     // phi = atan(y / x)
+}
+
+//! Spherical to cartesian coordinates
+template<typename T>
+void sph2cart(real_t r, real_t theta, real_t phi, T spherical, T &cartesian) {
+  cartesian[0] = sin(theta) * cos(phi) * spherical[0]           // x component (not x itself)
+               + cos(theta) * cos(phi) / r * spherical[1]
+               - sin(phi) / r / sin(theta) * spherical[2];
+  cartesian[1] = sin(theta) * sin(phi) * spherical[0]           // y component (not y itself)
+               + cos(theta) * sin(phi) / r * spherical[1]
+               + cos(phi) / r / sin(theta) * spherical[2];
+  cartesian[2] = cos(theta) * spherical[0]                      // z component (not z itself)
+               - sin(theta) / r * spherical[1];
+}
+
+//! Evaluate solid harmonics \f$ r^n Y_{n}^{m} \f$
+void evalMultipole(real_t rho, real_t alpha, real_t beta, complex_t *Ynm, complex_t *YnmTheta) {
+  real_t x = std::cos(alpha);                                   // x = cos(alpha)
+  real_t y = std::sin(alpha);                                   // y = sin(alpha)
+  real_t fact = 1;                                              // Initialize 2 * m + 1
+  real_t pn = 1;                                                // Initialize Legendre polynomial Pn
+  real_t rhom = 1;                                              // Initialize rho^m
+  complex_t ei = std::exp(I * beta);                            // exp(i * beta)
+  complex_t eim = 1.0;                                          // Initialize exp(i * m * beta)
+  for (int m=0; m<P; m++) {                                     // Loop over m in Ynm
+    real_t p = pn;                                              //  Associated Legendre polynomial Pnm
+    int npn = m * m + 2 * m;                                    //  Index of Ynm for m > 0
+    int nmn = m * m;                                            //  Index of Ynm for m < 0
+    Ynm[npn] = rhom * p * eim;                                  //  rho^m * Ynm for m > 0
+    Ynm[nmn] = std::conj(Ynm[npn]);                             //  Use conjugate relation for m < 0
+    real_t p1 = p;                                              //  Pnm-1
+    p = x * (2 * m + 1) * p1;                                   //  Pnm using recurrence relation
+    YnmTheta[npn] = rhom * (p - (m + 1) * x * p1) / y * eim;    // theta derivative of r^n * Ynm
+    rhom *= rho;                                                //  rho^m
+    real_t rhon = rhom;                                         //  rho^n
+    for (int n=m+1; n<P; n++) {                                 //  Loop over n in Ynm
+      int npm = n * n + n + m;                                  //   Index of Ynm for m > 0
+      int nmm = n * n + n - m;                                  //   Index of Ynm for m < 0
+      rhon /= -(n + m);                                         //   Update factorial
+      Ynm[npm] = rhon * p * eim;                                //   rho^n * Ynm
+      Ynm[nmm] = std::conj(Ynm[npm]);                           //   Use conjugate relation for m < 0
+      real_t p2 = p1;                                           //   Pnm-2
+      p1 = p;                                                   //   Pnm-1
+      p = (x * (2 * n + 1) * p1 - (n + m) * p2) / (n - m + 1);  //   Pnm using recurrence relation
+      YnmTheta[npm] = rhon * ((n - m + 1) * p - (n + 1) * x * p1) / y * eim;// theta derivative
+      rhon *= rho;                                              //   Update rho^n
+    }                                                           //  End loop over n in Ynm
+    rhom /= -(2 * m + 2) * (2 * m + 1);                         //  Update factorial
+    pn = -pn * fact * y;                                        //  Pn
+    fact += 2;                                                  //  2 * m + 1
+    eim *= ei;                                                  //  Update exp(i * m * beta)
+  }                                                             // End loop over m in Ynm
+}
+
+//! Evaluate singular harmonics \f$ r^{-n-1} Y_n^m \f$
+void evalLocal(real_t rho, real_t alpha, real_t beta, complex_t *Ynm) {
+  real_t x = std::cos(alpha);                                   // x = cos(alpha)
+  real_t y = std::sin(alpha);                                   // y = sin(alpha)
+  real_t fact = 1;                                              // Initialize 2 * m + 1
+  real_t pn = 1;                                                // Initialize Legendre polynomial Pn
+  real_t invR = -1.0 / rho;                                     // - 1 / rho
+  real_t rhom = -invR;                                          // Initialize rho^(-m-1)
+  complex_t ei = std::exp(I * beta);                            // exp(i * beta)
+  complex_t eim = 1.0;                                          // Initialize exp(i * m * beta)
+  for (int m=0; m<P; m++) {                                     // Loop over m in Ynm
+    real_t p = pn;                                              //  Associated Legendre polynomial Pnm
+    int npn = m * m + 2 * m;                                    //  Index of Ynm for m > 0
+    int nmn = m * m;                                            //  Index of Ynm for m < 0
+    Ynm[npn] = rhom * p * eim;                                  //  rho^(-m-1) * Ynm for m > 0
+    Ynm[nmn] = std::conj(Ynm[npn]);                             //  Use conjugate relation for m < 0
+    real_t p1 = p;                                              //  Pnm-1
+    p = x * (2 * m + 1) * p1;                                   //  Pnm using recurrence relation
+    rhom *= invR;                                               //  rho^(-m-1)
+    real_t rhon = rhom;                                         //  rho^(-n-1)
+    for (int n=m+1; n<P; n++) {                                 //  Loop over n in Ynm
+      int npm = n * n + n + m;                                  //   Index of Ynm for m > 0
+      int nmm = n * n + n - m;                                  //   Index of Ynm for m < 0
+      Ynm[npm] = rhon * p * eim;                                //   rho^n * Ynm for m > 0
+      Ynm[nmm] = std::conj(Ynm[npm]);                           //   Use conjugate relation for m < 0
+      real_t p2 = p1;                                           //   Pnm-2
+      p1 = p;                                                   //   Pnm-1
+      p = (x * (2 * n + 1) * p1 - (n + m) * p2) / (n - m + 1);  //   Pnm using recurrence relation
+      rhon *= invR * (n - m + 1);                               //   rho^(-n-1)
+    }                                                           //  End loop over n in Ynm
+    pn = -pn * fact * y;                                        //  Pn
+    fact += 2;                                                  //  2 * m + 1
+    eim *= ei;                                                  //  Update exp(i * m * beta)
+  }                                                             // End loop over m in Ynm
+}
+
+void Kernel::P2M(C_iter C) const {
+  complex_t Ynm[P*P], YnmTheta[P*P];
+  for (B_iter B=C->BODY; B!=C->BODY+C->NCBODY; B++) {
+    vec3 dX = B->X - C->X;
+    real_t R = std::sqrt(norm(dX));
+    if (R > C->RMAX) C->RMAX = R;
+    real_t rho, alpha, beta;
+    cart2sph(rho,alpha,beta,dX);
+    evalMultipole(rho,alpha,beta,Ynm,YnmTheta);
+    for (int n=0; n<P; n++) {
+      for (int m=0; m<=n; m++) {
+        int nm  = n * n + n - m;
+        int nms = n * (n + 1) / 2 + m;
+        C->M[nms] += B->SRC * Ynm[nm];
+      }
+    }
+  }
+#if USE_RMAX
+  C->RCRIT = std::min(C->R,C->RMAX);
+#else
+  C->RCRIT = C->R;
+#endif
+}
+
+void Kernel::M2M(C_iter Ci, C_iter C0) const {
+  complex_t Ynm[P*P], YnmTheta[P*P];
+  for (C_iter Cj=C0+Ci->CHILD; Cj!=C0+Ci->CHILD+Ci->NCHILD; Cj++) {
+    vec3 dX = Ci->X - Cj->X;
+    real_t R = std::sqrt(norm(dX)) + Cj->RCRIT;
+    if (R > Ci->RMAX) Ci->RMAX = R;
+    real_t rho, alpha, beta;
+    cart2sph(rho,alpha,beta,dX);
+    evalMultipole(rho,alpha,beta,Ynm,YnmTheta);
+    for (int j=0; j<P; j++) {
+      for (int k=0; k<=j; k++) {
+        int jks = j * (j + 1) / 2 + k;
+        complex_t M = 0;
+        for (int n=0; n<=j; n++) {
+          for (int m=std::max(-n,-j+k+n); m<=std::min(k-1,n); m++) {
+            int jnkms = (j - n) * (j - n + 1) / 2 + k - m;
+            int nm    = n * n + n - m;
+            M += Cj->M[jnkms] * Ynm[nm] * real_t(IPOW2N(m) * ODDEVEN(n));
+          }
+          for (int m=k; m<=std::min(n,j+k-n); m++) {
+            int jnkms = (j - n) * (j - n + 1) / 2 - k + m;
+            int nm    = n * n + n - m;
+            M += std::conj(Cj->M[jnkms]) * Ynm[nm] * real_t(ODDEVEN(k+n+m));
+          }
+        }
+        Ci->M[jks] += M;
+      }
+    }
+  }
+#if USE_RMAX
+  Ci->RCRIT = std::min(Ci->R,Ci->RMAX);
+#else
+  Ci->RCRIT = Ci->R;
+#endif
+}
+
+void Kernel::M2L(C_iter Ci, C_iter Cj, bool mutual) const {
+  complex_t Ynmi[P*P], Ynmj[P*P];
+  vec3 dX = Ci->X - Cj->X - Xperiodic;
+  real_t rho, alpha, beta;
+  cart2sph(rho,alpha,beta,dX);
+  evalLocal(rho,alpha,beta,Ynmi);
+  if (mutual) evalLocal(rho,alpha+M_PI,beta,Ynmj);
+  for (int j=0; j<P; j++) {
+    real_t Cnm = ODDEVEN(j);
+    for (int k=0; k<=j; k++) {
+      int jks = j * (j + 1) / 2 + k;
+      complex_t Li = 0, Lj = 0;
+      for (int n=0; n<P-j; n++) {
+        for (int m=-n; m<0; m++) {
+          int nms  = n * (n + 1) / 2 - m;
+          int jnkm = (j + n) * (j + n) + j + n + m - k;
+          Li += std::conj(Cj->M[nms]) * Cnm * Ynmi[jnkm];
+          if (mutual) Lj += std::conj(Ci->M[nms]) * Cnm * Ynmj[jnkm];
+        }
+        for (int m=0; m<=n; m++) {
+          int nms  = n * (n + 1) / 2 + m;
+          int jnkm = (j + n) * (j + n) + j + n + m - k;
+          real_t Cnm2 = Cnm * ODDEVEN((k-m)*(k<m)+m);
+          Li += Cj->M[nms] * Cnm2 * Ynmi[jnkm];
+          if (mutual) Lj += Ci->M[nms] * Cnm2 * Ynmj[jnkm];
+        }
+      }
+      Ci->L[jks] += Li;
+      if (mutual) Cj->L[jks] += Lj;
+    }
+  }
+}
+
+void Kernel::L2L(C_iter Ci, C_iter C0) const {
+  complex_t Ynm[P*P], YnmTheta[P*P];
+  C_iter Cj = C0 + Ci->PARENT;
+  vec3 dX = Ci->X - Cj->X;
+  real_t rho, alpha, beta;
+  cart2sph(rho,alpha,beta,dX);
+  evalMultipole(rho,alpha,beta,Ynm,YnmTheta);
+  for (int j=0; j<P; j++) {
+    for (int k=0; k<=j; k++) {
+      int jks = j * (j + 1) / 2 + k;
+      complex_t L = 0;
+      for (int n=j; n<P; n++) {
+        for (int m=j+k-n; m<0; m++) {
+          int jnkm = (n - j) * (n - j) + n - j + m - k;
+          int nms  = n * (n + 1) / 2 - m;
+          L += std::conj(Cj->L[nms]) * Ynm[jnkm] * real_t(ODDEVEN(k));
+        }
+        for (int m=0; m<=n; m++) {
+          if( n-j >= abs(m-k) ) {
+            int jnkm = (n - j) * (n - j) + n - j + m - k;
+            int nms  = n * (n + 1) / 2 + m;
+            L += Cj->L[nms] * Ynm[jnkm] * real_t(ODDEVEN((m-k)*(m<k)));
+          }
+        }
+      }
+      Ci->L[jks] += L;
+    }
+  }
+}
+
+void Kernel::L2P(C_iter Ci) const {
+  complex_t Ynm[P*P], YnmTheta[P*P];
+  for (B_iter B=Ci->BODY; B!=Ci->BODY+Ci->NCBODY; B++) {
+    vec3 dX = B->X - Ci->X;
+    vec3 spherical = 0;
+    vec3 cartesian = 0;
+    real_t r, theta, phi;
+    cart2sph(r,theta,phi,dX);
+    evalMultipole(r,theta,phi,Ynm,YnmTheta);
+    B->TRG /= B->SRC;
+    for (int n=0; n<P; n++) {
+      int nm  = n * n + n;
+      int nms = n * (n + 1) / 2;
+      B->TRG[0] += std::real(Ci->L[nms] * Ynm[nm]);
+      spherical[0] += std::real(Ci->L[nms] * Ynm[nm]) / r * n;
+      spherical[1] += std::real(Ci->L[nms] * YnmTheta[nm]);
+      for( int m=1; m<=n; m++) {
+        nm  = n * n + n + m;
+        nms = n * (n + 1) / 2 + m;
+        B->TRG[0] += 2 * std::real(Ci->L[nms] * Ynm[nm]);
+        spherical[0] += 2 * std::real(Ci->L[nms] * Ynm[nm]) / r * n;
+        spherical[1] += 2 * std::real(Ci->L[nms] * YnmTheta[nm]);
+        spherical[2] += 2 * std::real(Ci->L[nms] * Ynm[nm] * I) * m;
+      }
+    }
+    sph2cart(r,theta,phi,spherical,cartesian);
+    B->TRG[1] += cartesian[0];
+    B->TRG[2] += cartesian[1];
+    B->TRG[3] += cartesian[2];
+  }
+}

kernels/CMakeLists.txt

 IF(USE_GPU)
-  CUDA_ADD_LIBRARY(Kernels ${EQUATION}${BASIS}${DEVICE}.cu)
+  CUDA_ADD_LIBRARY(Kernels ${DIMENSION}${EQUATION}${BASIS}${DEVICE}.cu)
   TARGET_LINK_LIBRARIES(Kernels ${CUDA_LIBRARIES})
 ELSE()
-  ADD_LIBRARY(Kernels ${EQUATION}${BASIS}${DEVICE}.cxx)
+  ADD_LIBRARY(Kernels ${DIMENSION}${EQUATION}${BASIS}${DEVICE}.cxx)
 ENDIF()

kernels/LaplaceCartesianCPU.cxx

-#include "kernel.h"
-const real_t EPS = 1e-12;                                       // Double precision epsilon
-
-template<int nx, int ny, int nz>
-struct Index {
-  static const int                I = Index<nx,ny+1,nz-1>::I + 1;
-  static const unsigned long long F = Index<nx,ny,nz-1>::F * nz;
-};
-
-template<int nx, int ny>
-struct Index<nx,ny,0> {
-  static const int                I = Index<nx+1,0,ny-1>::I + 1;
-  static const unsigned long long F = Index<nx,ny-1,0>::F * ny;
-};
-
-template<int nx>
-struct Index<nx,0,0> {
-  static const int                I = Index<0,0,nx-1>::I + 1;
-  static const unsigned long long F = Index<nx-1,0,0>::F * nx;
-};
-
-template<>
-struct Index<0,0,0> {
-  static const int                I = 0;
-  static const unsigned long long F = 1;
-};
-
-
-template<int n, int kx, int ky , int kz, int d>
-struct DerivativeTerm {
-  static const int coef = 1 - 2 * n;
-  static inline real_t kernel(const vecP &C, const vec3 &dX) {
-    return coef * dX[d] * C[Index<kx,ky,kz>::I];
-  }
-};
-
-template<int n, int kx, int ky , int kz>
-struct DerivativeTerm<n,kx,ky,kz,-1> {
-  static const int coef = 1 - n;
-  static inline real_t kernel(const vecP &C, const vec3&) {
-    return coef * C[Index<kx,ky,kz>::I];
-  }
-};
-
-
-template<int nx, int ny, int nz, int kx=nx, int ky=ny, int kz=nz, int flag=5>
-struct DerivativeSum {
-  static const int nextflag = 5 - (kz < nz || kz == 1);
-  static const int dim = kz == (nz-1) ? -1 : 2;
-  static const int n = nx + ny + nz;
-  static inline real_t loop(const vecP &C, const vec3 &dX) {
-    return DerivativeSum<nx,ny,nz,nx,ny,kz-1,nextflag>::loop(C,dX)
-         + DerivativeTerm<n,nx,ny,kz-1,dim>::kernel(C,dX);
-  }
-};
-
-template<int nx, int ny, int nz, int kx, int ky, int kz>
-struct DerivativeSum<nx,ny,nz,kx,ky,kz,4> {
-  static const int nextflag = 3 - (ny == 0);
-  static inline real_t loop(const vecP &C, const vec3 &dX) {
-    return DerivativeSum<nx,ny,nz,nx,ny,nz,nextflag>::loop(C,dX);
-  }
-};
-
-template<int nx, int ny, int nz, int kx, int ky, int kz>
-struct DerivativeSum<nx,ny,nz,kx,ky,kz,3> {
-  static const int nextflag = 3 - (ky < ny || ky == 1);
-  static const int dim = ky == (ny-1) ? -1 : 1;
-  static const int n = nx + ny + nz;
-  static inline real_t loop(const vecP &C, const vec3 &dX) {
-    return DerivativeSum<nx,ny,nz,nx,ky-1,nz,nextflag>::loop(C,dX)
-         + DerivativeTerm<n,nx,ky-1,nz,dim>::kernel(C,dX);
-  }
-};
-
-template<int nx, int ny, int nz, int kx, int ky, int kz>
-struct DerivativeSum<nx,ny,nz,kx,ky,kz,2> {
-  static const int nextflag = 1 - (nx == 0);
-  static inline real_t loop(const vecP &C, const vec3 &dX) {
-    return DerivativeSum<nx,ny,nz,nx,ny,nz,nextflag>::loop(C,dX);
-  }
-};
-
-template<int nx, int ny, int nz, int kx, int ky, int kz>
-struct DerivativeSum<nx,ny,nz,kx,ky,kz,1> {
-  static const int nextflag = 1 - (kx < nx || kx == 1);
-  static const int dim = kx == (nx-1) ? -1 : 0;
-  static const int n = nx + ny + nz;
-  static inline real_t loop(const vecP &C, const vec3 &dX) {
-    return DerivativeSum<nx,ny,nz,kx-1,ny,nz,nextflag>::loop(C,dX)
-         + DerivativeTerm<n,kx-1,ny,nz,dim>::kernel(C,dX);
-  }
-};
-
-template<int nx, int ny, int nz, int kx, int ky, int kz>
-struct DerivativeSum<nx,ny,nz,kx,ky,kz,0> {
-  static inline real_t loop(const vecP&, const vec3&) {
-    return 0;
-  }
-};
-
-template<int nx, int ny, int nz, int kx, int ky>
-struct DerivativeSum<nx,ny,nz,kx,ky,0,5> {
-  static inline real_t loop(const vecP &C, const vec3 &dX) {
-    return DerivativeSum<nx,ny,nz,nx,ny,0,4>::loop(C,dX);
-  }
-};
-
-
-template<int nx, int ny, int nz, int kx=nx, int ky=ny, int kz=nz>
-struct MultipoleSum {
-  static inline real_t kernel(const vecP &C, const vecP &M) {
-    return MultipoleSum<nx,ny,nz,kx,ky,kz-1>::kernel(C,M)
-         + C[Index<nx-kx,ny-ky,nz-kz>::I]*M[Index<kx,ky,kz>::I];
-  }
-};
-
-template<int nx, int ny, int nz, int kx, int ky>
-struct MultipoleSum<nx,ny,nz,kx,ky,0> {
-  static inline real_t kernel(const vecP &C, const vecP &M) {
-    return MultipoleSum<nx,ny,nz,kx,ky-1,nz>::kernel(C,M)
-         + C[Index<nx-kx,ny-ky,nz>::I]*M[Index<kx,ky,0>::I];
-  }
-};
-
-template<int nx, int ny, int nz, int kx>
-struct MultipoleSum<nx,ny,nz,kx,0,0> {
-  static inline real_t kernel(const vecP &C, const vecP &M) {
-    return MultipoleSum<nx,ny,nz,kx-1,ny,nz>::kernel(C,M)
-         + C[Index<nx-kx,ny,nz>::I]*M[Index<kx,0,0>::I];
-  }
-};
-
-template<int nx, int ny, int nz>
-struct MultipoleSum<nx,ny,nz,0,0,0> {
-  static inline real_t kernel(const vecP&, const vecP&) { return 0; }
-};
-
-
-template<int nx, int ny, int nz, int kx=0, int ky=0, int kz=P-1-nx-ny-nz>
-struct LocalSum {
-  static inline real_t kernel(const vecP &M, const vecP &L) {
-    return LocalSum<nx,ny,nz,kx,ky+1,kz-1>::kernel(M,L)
-         + M[Index<kx,ky,kz>::I] * L[Index<nx+kx,ny+ky,nz+kz>::I];
-  }
-};
-
-template<int nx, int ny, int nz, int kx, int ky>
-struct LocalSum<nx,ny,nz,kx,ky,0> {
-  static inline real_t kernel(const vecP &M, const vecP &L) {
-    return LocalSum<nx,ny,nz,kx+1,0,ky-1>::kernel(M,L)
-         + M[Index<kx,ky,0>::I] * L[Index<nx+kx,ny+ky,nz>::I];
-  }
-};
-
-template<int nx, int ny, int nz, int kx>
-struct LocalSum<nx,ny,nz,kx,0,0> {
-  static inline real_t kernel(const vecP &M, const vecP &L) {
-    return LocalSum<nx,ny,nz,0,0,kx-1>::kernel(M,L)
-         + M[Index<kx,0,0>::I] * L[Index<nx+kx,ny,nz>::I];
-  }
-};
-
-template<int nx, int ny, int nz>
-struct LocalSum<nx,ny,nz,0,0,0> {
-  static inline real_t kernel(const vecP&, const vecP&) { return 0; }
-};
-
-
-template<int nx, int ny, int nz>
-struct Kernels {
-  static inline void power(vecP &C, const vec3 &dX) {
-    Kernels<nx,ny+1,nz-1>::power(C,dX);
-    C[Index<nx,ny,nz>::I] = C[Index<nx,ny,nz-1>::I] * dX[2] / nz;
-  }
-  static inline void derivative(vecP &C, const vec3 &dX, const real_t &invR2) {
-    static const int n = nx + ny + nz;
-    Kernels<nx,ny+1,nz-1>::derivative(C,dX,invR2);
-    C[Index<nx,ny,nz>::I] = DerivativeSum<nx,ny,nz>::loop(C,dX) / n * invR2;
-  }
-  static inline void scale(vecP &C) {
-    Kernels<nx,ny+1,nz-1>::scale(C);
-    C[Index<nx,ny,nz>::I] *= Index<nx,ny,nz>::F;
-  }
-  static inline void M2M(vecP &MI, const vecP &C, const vecP &MJ) {
-    Kernels<nx,ny+1,nz-1>::M2M(MI,C,MJ);
-    MI[Index<nx,ny,nz>::I] += MultipoleSum<nx,ny,nz>::kernel(C,MJ);
-  }
-  static inline void M2L(vecP &L, const vecP &C, const vecP &M) {
-    Kernels<nx,ny+1,nz-1>::M2L(L,C,M);
-    L[Index<nx,ny,nz>::I] += LocalSum<nx,ny,nz>::kernel(M,C);
-  }
-  static inline void L2L(vecP &LI, const vecP &C, const vecP &LJ) {
-    Kernels<nx,ny+1,nz-1>::L2L(LI,C,LJ);
-    LI[Index<nx,ny,nz>::I] += LocalSum<nx,ny,nz>::kernel(C,LJ);
-  }
-  static inline void L2P(B_iter B, const vecP &C, const vecP &L) {
-    Kernels<nx,ny+1,nz-1>::L2P(B,C,L);
-    B->TRG[Index<nx,ny,nz>::I] += LocalSum<nx,ny,nz>::kernel(C,L);
-  }
-};
-
-template<int nx, int ny>
-struct Kernels<nx,ny,0> {
-  static inline void power(vecP &C, const vec3 &dX) {
-    Kernels<nx+1,0,ny-1>::power(C,dX);
-    C[Index<nx,ny,0>::I] = C[Index<nx,ny-1,0>::I] * dX[1] / ny;
-  }
-  static inline void derivative(vecP &C, const vec3 &dX, const real_t &invR2) {
-    static const int n = nx + ny;
-    Kernels<nx+1,0,ny-1>::derivative(C,dX,invR2);
-    C[Index<nx,ny,0>::I] = DerivativeSum<nx,ny,0>::loop(C,dX) / n * invR2;
-  }
-  static inline void scale(vecP &C) {
-    Kernels<nx+1,0,ny-1>::scale(C);
-    C[Index<nx,ny,0>::I] *= Index<nx,ny,0>::F;
-  }
-  static inline void M2M(vecP &MI, const vecP &C, const vecP &MJ) {
-    Kernels<nx+1,0,ny-1>::M2M(MI,C,MJ);
-    MI[Index<nx,ny,0>::I] += MultipoleSum<nx,ny,0>::kernel(C,MJ);
-  }
-  static inline void M2L(vecP &L, const vecP &C, const vecP &M) {
-    Kernels<nx+1,0,ny-1>::M2L(L,C,M);
-    L[Index<nx,ny,0>::I] += LocalSum<nx,ny,0>::kernel(M,C);
-  }
-  static inline void L2L(vecP &LI, const vecP &C, const vecP &LJ) {
-    Kernels<nx+1,0,ny-1>::L2L(LI,C,LJ);
-    LI[Index<nx,ny,0>::I] += LocalSum<nx,ny,0>::kernel(C,LJ);
-  }
-  static inline void L2P(B_iter B, const vecP &C, const vecP &L) {
-    Kernels<nx+1,0,ny-1>::L2P(B,C,L);
-    B->TRG[Index<nx,ny,0>::I] += LocalSum<nx,ny,0>::kernel(C,L);
-  }
-};
-
-template<int nx>
-struct Kernels<nx,0,0> {
-  static inline void power(vecP &C, const vec3 &dX) {
-    Kernels<0,0,nx-1>::power(C,dX);
-    C[Index<nx,0,0>::I] = C[Index<nx-1,0,0>::I] * dX[0] / nx;
-  }
-  static inline void derivative(vecP &C, const vec3 &dX, const real_t &invR2) {
-    static const int n = nx;
-    Kernels<0,0,nx-1>::derivative(C,dX,invR2);
-    C[Index<nx,0,0>::I] = DerivativeSum<nx,0,0>::loop(C,dX) / n * invR2;
-  }
-  static inline void scale(vecP &C) {
-    Kernels<0,0,nx-1>::scale(C);
-    C[Index<nx,0,0>::I] *= Index<nx,0,0>::F;
-  }
-  static inline void M2M(vecP &MI, const vecP &C, const vecP &MJ) {
-    Kernels<0,0,nx-1>::M2M(MI,C,MJ);
-    MI[Index<nx,0,0>::I] += MultipoleSum<nx,0,0>::kernel(C,MJ);
-  }
-  static inline void M2L(vecP &L, const vecP &C, const vecP &M) {
-    Kernels<0,0,nx-1>::M2L(L,C,M);
-    L[Index<nx,0,0>::I] += LocalSum<nx,0,0>::kernel(M,C);
-  }
-  static inline void L2L(vecP &LI, const vecP &C, const vecP &LJ) {
-    Kernels<0,0,nx-1>::L2L(LI,C,LJ);
-    LI[Index<nx,0,0>::I] += LocalSum<nx,0,0>::kernel(C,LJ);
-  }
-  static inline void L2P(B_iter B, const vecP &C, const vecP &L) {
-    Kernels<0,0,nx-1>::L2P(B,C,L);
-    B->TRG[Index<nx,0,0>::I] += LocalSum<nx,0,0>::kernel(C,L);
-  }
-};
-
-template<>
-struct Kernels<0,0,0> {
-  static inline void power(vecP&, const vec3&) {}
-  static inline void derivative(vecP&, const vec3&, const real_t&) {}
-  static inline void scale(vecP&) {}
-  static inline void M2M(vecP&, const vecP&, const vecP&) {}
-  static inline void M2L(vecP&, const vecP&, const vecP&) {}
-  static inline void L2L(vecP&, const vecP&, const vecP&) {}
-  static inline void L2P(B_iter, const vecP&, const vecP&) {}
-};
-
-
-template<int PP>
-inline void getCoef(vecP &C, const vec3 &dX, real_t &invR2, const real_t &invR) {
-  C[0] = invR;
-  Kernels<0,0,PP>::derivative(C,dX,invR2);
-  Kernels<0,0,PP>::scale(C);
-}
-
-template<>
-inline void getCoef<1>(vecP &C, const vec3 &dX, real_t &invR2, const real_t &invR) {
-  C[0] = invR;
-  invR2 = -invR2;
-  real_t x = dX[0], y = dX[1], z = dX[2];
-  real_t invR3 = invR * invR2;
-  C[1] = x * invR3;
-  C[2] = y * invR3;
-  C[3] = z * invR3;
-}
-
-template<>
-inline void getCoef<2>(vecP &C, const vec3 &dX, real_t &invR2, const real_t &invR) {
-  getCoef<1>(C,dX,invR2,invR);