Rio Yokota avatar Rio Yokota committed e73b65d

2*P -> P

Comments (0)

Files changed (36)

 
 *.a
 *.dat
-*.dot
 *.g80emu
 *.o
 *.out
-*.plist
 *.so
-*.svg
-*.swp
-.Apple*
+a594c63f6ae633a0d84666c71b789e1cb5537887 working copy with enasyunis/gromacs
 EXPAND  = Spherical
 
 ### GCC compiler
+#CXX	= mpicxx -ggdb3 -Wall -Wextra -O3 -fopenmp -ffast-math -funroll-loops -fforce-addr -fPIC -I../include
 CXX	= mpicxx -ggdb3 -Wall -Wextra -O3 -ffast-math -funroll-loops -fforce-addr -fPIC -I../include
 ### Intel compiler
 #CXX	= mpicxx -Wall -xHOST -O3 -openmp -funroll-loops -finline-functions -fPIC -ansi-alias -I../include
-
 ### BG/P compiler
 #CXX	= mpixlcxx -qarch=450 -qtune=450 -I../include
-
 ### K computer
-#CXX	= mpiFCCpx -Kopenmp,fast,ocl,preex,array_private -Ksimd=2,NOFLTLD,mfunc=3,optmsg=2 -Nsrc,sta -I../include -DNDEBUG
-# -Kdalign,ns,mfunc,eval,prefetch_conditional,ilfunc -x-
-
+#CXX	= mpiFCCpx -Kfast,openmp
 ### CUDA compiler
-NVCC    = nvcc --compiler-bindir=/usr/bin/gcc-4.4 -Xcompiler -fopenmp --ptxas-options=-v -O3\
+NVCC    = nvcc -Xcompiler -fopenmp --ptxas-options=-v -O3\
 	 -use_fast_math -arch=sm_21 -I../include -I$(CUDA_INSTALL_PATH)/include
 
 ### Base flags
 
 ifeq ($(DEVICE),GPU)
 ### CUDA flags
-LFLAGS  += -DQUEUE -L$(CUDA_INSTALL_PATH)/lib64 -lcuda -lcudart -lstdc++ -ldl -lm
+LFLAGS  += -DQUEUE -L$(CUDA_INSTALL_PATH)/lib64 -lcuda -lcudart
 endif
 
 OBJECT = ../kernel/$(DEVICE)$(EXPAND)Laplace.o ../kernel/$(DEVICE)VanDerWaals.o\

include/bottomup.h

     const long N = bodies.size() * MPISIZE;                     // Number of bodies
     int level;                                                  // Max level
     level = N >= NCRIT ? 1 + int(log(N / NCRIT)/M_LN2/3) : 0;   // Decide max level from N/Ncrit
+    if( level < 2 ) level = 2;
     int MPIlevel = int(log(MPISIZE-1) / M_LN2 / 3) + 1;         // Level of local root cell
     if( MPISIZE == 1 ) MPIlevel = 0;                            // For serial execution local root cell is root cell
     if( MPIlevel > level ) {                                    // If process hierarchy is deeper than tree
-//      std::cout << "Process hierarchy is deeper than tree @ rank" << MPIRANK << std::endl;
+      std::cout << "Process hierarchy is deeper than tree @ rank" << MPIRANK << std::endl;
       level = MPIlevel;
     }
     return level;                                               // Return max level

include/dataset.h

     file.close();                                               // Close file
   }
 
+  void sampleBodies(Bodies &bodies, int numTargets) {
+    int n = bodies.size();
+    int p = n / numTargets;
+    assert(p > 0);
+    for (int i=0; i<numTargets; i++) {
+      assert(i * p < n);
+      bodies[i] = bodies[i*p];
+    }
+    bodies.resize(numTargets);
+  }
+
 //! Evaluate relative L2 norm error
   void evalError(Bodies &bodies, Bodies &bodies2,
                  real &diff1, real &norm1, real &diff2, real &norm2, bool ewald=false) {
     file.close();                                               // Close file
   }
 
+  void sampleBodies(Bodies &bodies, int numTargets) {
+    int n = bodies.size();
+    int p = n / numTargets;
+    assert(p > 0);
+    for (int i=0; i<numTargets; i++) {
+      assert(i * p < n);
+      bodies[i] = bodies[i*p];
+    }
+    bodies.resize(numTargets);
+  }
+
 //! Evaluate relative L2 norm error
   void evalError(Bodies &bodies, Bodies &bodies2,
                  real &diff1, real &norm1, real &diff2, real &norm2) {

include/evaluator.h

   void traverseQueue(Pair pair) {
     PairQueue pairQueue;                                        // Queue of interacting cell pairs
 #if QUARK
-    Quark *quark = QUARK_New(4);                                // Initialize QUARK object
-    C_iter root = pair.first;                                   // Iterator for root target cell
+    Quark *quark = QUARK_New(12);                                // Initialize QUARK object
 #endif
     pairQueue.push_back(pair);                                  // Push pair to queue
     while( !pairQueue.empty() ) {                               // While dual traversal queue is not empty
         }                                                       //   End loop over second cell's children
       }                                                         //  End if for which cell to split
 #if QUARK
-      if( int(pairQueue.size()) > root->NDLEAF / 100 ) {        //  When queue size reaches threshold
+      if( int(pairQueue.size()) > 100 ) {                       //  When queue size reaches threshold
         while( !pairQueue.empty() ) {                           //   While dual traversal queue is not empty
           pair = pairQueue.front();                             //    Get interaction pair from front of queue
           pairQueue.pop_front();                                //    Pop dual traversal queue
       for( int ix=-1; ix<=1; ++ix ) {                           //  Loop over x periodic direction
         for( int iy=-1; iy<=1; ++iy ) {                         //   Loop over y periodic direction
           for( int iz=-1; iz<=1; ++iz ) {                       //    Loop over z periodic direction
-            cell.X[0] = C->X[0] + ix * 2 * C->R;                //     Set new x coordinate for periodic image
-            cell.X[1] = C->X[1] + iy * 2 * C->R;                //     Set new y cooridnate for periodic image
-            cell.X[2] = C->X[2] + iz * 2 * C->R;                //     Set new z coordinate for periodic image
-            cell.M = C->M;                                      //     Copy multipoles to new periodic image
-            pjcells.push_back(cell);                            //     Push cell into periodic jcell vector
+            if( ix != 0 || iy != 0 || iz != 0 ) {               //     If periodic cell is not at center
+              cell.X[0] = C->X[0] + ix * 2 * C->R;              //      Set new x coordinate for periodic image
+              cell.X[1] = C->X[1] + iy * 2 * C->R;              //      Set new y cooridnate for periodic image
+              cell.X[2] = C->X[2] + iz * 2 * C->R;              //      Set new z coordinate for periodic image
+              cell.M = C->M;                                    //      Copy multipoles to new periodic image
+              pjcells.push_back(cell);                          //      Push cell into periodic jcell vector
+            }                                                   //     Endif for periodic center cell
           }                                                     //    End loop over z periodic direction
         }                                                       //   End loop over y periodic direction
       }                                                         //  End loop over x periodic direction
       pccells.push_back(cell);                                  //  Push cell into periodic cell vector
       C_iter Ci = pccells.end() - 1;                            //  Set current cell as target for M2M
       Ci->CHILD = 0;                                            //  Set child cells for periodic M2M
-      Ci->NCHILD = 27;                                          //  Set number of child cells for periodic M2M
+      Ci->NCHILD = 26;                                          //  Set number of child cells for periodic M2M
       evalM2M(pccells,pjcells);                                 // Evaluate periodic M2M kernels for this sublevel
       pjcells.clear();                                          // Clear periodic jcell vector
     }                                                           // End loop over sublevels of tree
     flagM2P[0][Cj] |= Icenter;                                  // Flip bit of periodic image flag
   }
 
-//! Create periodic images of bodies
-  Bodies periodicBodies(Bodies &bodies) {
-    Bodies jbodies;                                             // Vector for periodic images of bodies
-    int prange = getPeriodicRange();                            // Get range of periodic images
-    for( int ix=-prange; ix<=prange; ++ix ) {                   // Loop over x periodic direction
-      for( int iy=-prange; iy<=prange; ++iy ) {                 //  Loop over y periodic direction
-        for( int iz=-prange; iz<=prange; ++iz ) {               //   Loop over z periodic direction
-          for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {//    Loop over bodies
-            Body body = *B;                                     //     Copy current body
-            body.X[0] += ix * 2 * R0;                           //     Shift x position
-            body.X[1] += iy * 2 * R0;                           //     Shift y position
-            body.X[2] += iz * 2 * R0;                           //     Shift z position
-            jbodies.push_back(body);                            //     Push shifted body into jbodies
-          }                                                     //    End loop over bodies
-        }                                                       //   End loop over z periodic direction
-      }                                                         //  End loop over y periodic direction
-    }                                                           // End loop over x periodic direction
-    return jbodies;                                             // Return vector for periodic images of bodies
-  }
-
 //! Use multipole acceptance criteria to determine whether to approximate, do P2P, or subdivide
   void interact(C_iter Ci, C_iter Cj, PairQueue &pairQueue) {
     vect dX = Ci->X - Cj->X - Xperiodic;                        // Distance vector from source to target
           }                                                     //    End loop over z periodic direction
         }                                                       //   End loop over y periodic direction
       }                                                         //  End loop over x periodic direction
+#if QUEUE
       for( C_iter Ci=cells.begin(); Ci!=cells.end(); ++Ci ) {   //  Loop over target cells
         listM2L[Ci-Ci0].sort();                                 //  Sort interaction list
         listM2L[Ci-Ci0].unique();                               //  Eliminate duplicate periodic entries
         listP2P[Ci-Ci0].sort();                                 //  Sort interaction list
         listP2P[Ci-Ci0].unique();                               //  Eliminate duplicate periodic entries
       }                                                         //  End loop over target cells
+#endif
     }                                                           // Endif for periodic boundary condition
   }
 
 //! Traverse neighbor cells only (for cutoff based methods)
   void neighbor(Cells &cells, Cells &jcells) {
-    C_iter root = cells.end() - 1;                              // Iterator for root target cell
     C_iter jroot = jcells.end() - 1;                            // Iterator for root source cell
     Ci0 = cells.begin();                                        // Set begin iterator for target cells
     Cj0 = jcells.begin();                                       // Set begin iterator for source cells
   void clearBuffers();                                          //!< Clear GPU buffers
 
   void evalP2P(Bodies &ibodies, Bodies &jbodies, bool onCPU=false);//!< Evaluate all P2P kernels (all pairs)
+  void periodicP2P(Bodies &ibodies, Bodies &jbodies, bool onCPU=false);//!< Evaluate all P2P kernels (all pairs) for periodic
   void evalP2M(Cells &cells);                                   //!< Evaluate all P2M kernels
   void evalM2M(Cells &cells, Cells &jcells);                    //!< Evaluate all M2M kernels
   void evalM2L(C_iter Ci, C_iter Cj);                           //!< Evaluate on CPU, queue on GPU
                  sourceDevcSize(0), targetDevcSize(0),
                  keysDevc(), rangeDevc(), sourceDevc(), targetDevc(),
                  factorial(), prefactor(), Anm(), Cnm(),
-                 X0(0), R0(0) {}
+                 X0(0), R0(-1/EPS) {}
 //! Destructor
   ~KernelBase() {}
 //! Copy constructor
                  sourceDevcSize(0), targetDevcSize(0),
                  keysDevc(), rangeDevc(), sourceDevc(), targetDevc(),
                  factorial(), prefactor(), Anm(), Cnm(),
-                 X0(0), R0(0) {}
+                 X0(0), R0(-1/EPS) {}
 //! Overload assignment
   KernelBase &operator=(const KernelBase) {return *this;}
 
 //! Set center and size of root cell
   void setDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
     vect xmin,xmax;                                             // Min,Max of domain
-    B_iter B = bodies.begin();                                  // Reset body iterator
-    xmin = xmax = B->X;                                         // Initialize xmin,xmax
-    for( B=bodies.begin(); B!=bodies.end(); ++B ) {             // Loop over bodies
+    for( int d=0; d!=3; ++d ) {                                 //  Loop over each dimension
+      xmin[d] = x0[d] - r0;                                     //   Initialize xmin
+      xmax[d] = x0[d] + r0;                                     //   Initialize xmax
+    }                                                           //  End loop over each dimension
+    for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {      // Loop over bodies
       for( int d=0; d!=3; ++d ) {                               //  Loop over each dimension
         if     (B->X[d] < xmin[d]) xmin[d] = B->X[d];           //   Determine xmin
         else if(B->X[d] > xmax[d]) xmax[d] = B->X[d];           //   Determine xmax
       }                                                         //  End loop over each dimension
     }                                                           // End loop over bodies
-/*
-    if( xmin[0] < x0[0]-r0 || x0[0]+r0 < xmax[0]                //  Check for outliers in x direction
-     || xmin[1] < x0[1]-r0 || x0[1]+r0 < xmax[1]                //  Check for outliers in y direction
-     || xmin[2] < x0[2]-r0 || x0[2]+r0 < xmax[2] ) {            //  Check for outliers in z direction
-      std::cout << "Error: Particles located outside periodic domain : " << std::endl;// Print error message
-      std::cout << xmin << std::endl;
-      std::cout << xmax << std::endl;
-    }                                                         //  End if for outlier checking
-*/
-    X0 = x0;                                                  //  Center is [0, 0, 0]
-    R0 = r0;                                                  //  Radius is r0
+    if( IMAGES != 0 ) {                                         // If periodic boundary
+      if( xmin[0] < x0[0]-r0 || x0[0]+r0 < xmax[0]              //  Check for outliers in x direction
+       || xmin[1] < x0[1]-r0 || x0[1]+r0 < xmax[1]              //  Check for outliers in y direction
+       || xmin[2] < x0[2]-r0 || x0[2]+r0 < xmax[2] ) {          //  Check for outliers in z direction
+        std::cout << "Error: Particles located outside periodic domain : " << std::endl;// Print error message
+        std::cout << xmin << std::endl;                         //   Print error message
+        std::cout << xmax << std::endl;                         //   Print error message
+      }                                                         //  End if for outlier checking
+      X0 = x0;                                                  //  Center is x0
+      R0 = r0;                                                  //  Radius is r0
+    } else {                                                    // If free boundary
+      for( int d=0; d!=3; ++d ) {                               //  Loop over each dimension
+        X0[d] = (xmin[d] + xmax[d]) * .5;                       //   Center of domain
+        R0 = std::max(X0[d]-xmin[d],R0);                        //   Radius of domain
+        R0 = std::max(xmax[d]-X0[d],R0);                        //   Radius of domain
+      }                                                         //  End loop over each dimension
+      R0 *= (1 + EPS);                                          //  Add some leeway to domain size
+    }                                                           // End if for periodic boundary
   }
 
 //! Precalculate M2L translation matrix
 #include <mpi.h>
 #include <typeinfo>
 #include "types.h"
+#include <unistd.h>
 
 //! Custom MPI utilities
 class MyMPI {

include/parallelfmm.h

       sendBodyDsp[i] *= bytes;                                  //  Multiply by bytes
       recvBodyCnt[i] *= bytes;                                  //  Multiply by bytes
       recvBodyDsp[i] *= bytes;                                  //  Multiply by bytes
-      if(printNow&&i!=0) std::cout << "Rank " << MPIRANK << " sends to rank " << i << " : " << sendBodyCnt[i] << " Bytes for P2P" << std::endl;
     }                                                           // End loop over ranks
     MPI_Alltoallv(&sendBodies[0],&sendBodyCnt[0],&sendBodyDsp[0],MPI_BYTE,
                   &recvBodies[0],&recvBodyCnt[0],&recvBodyDsp[0],MPI_BYTE,MPI_COMM_WORLD);
       sendCellCnt[irank] = sendCells.size()-ssize;              //  Set cell send count of current rank
       sendCellDsp[irank] = ssize;                               //  Set cell send displacement of current rank
       ssize += sendCellCnt[irank];                              //  Increment offset for vector send cells
-      if( printNow ) {
-        int sendSize[10] = {0,0,0,0,0,0,0,0,0,0};
-        for( JC_iter JC=sendCells.begin(); JC!=sendCells.end(); ++JC ) {
-          int level = getLevel(JC->ICELL);
-          if(sendCellDsp[irank] <= JC-sendCells.begin() && JC-sendCells.begin() < ssize )
-            sendSize[level] += sizeof(sendCells[0]);
-        }
-        for( int l=0; l<10; l++ ) {
-          if(irank!=0) std::cout << "Rank " << MPIRANK << " sends to rank " << irank << " : " << sendSize[l] << " Bytes @ level " << l << std::endl;
-        }
-      }
     }                                                           // End loop over ranks
     stopTimer("Get LET",printNow);                              // Stop timer
     startTimer("Alltoall C");                                   // Start timer
       sendCellDsp[i] *= bytes;                                  //  Multiply by bytes
       recvCellCnt[i] *= bytes;                                  //  Multiply by bytes
       recvCellDsp[i] *= bytes;                                  //  Multiply by bytes
-      if(printNow&&i!=0) std::cout << "Rank " << MPIRANK << " sends to rank " << i << " : " << sendCellCnt[i] << " Bytes for M2L total" << std::endl;
     }                                                           // End loop over ranks
     MPI_Alltoallv(&sendCells[0],&sendCellCnt[0],&sendCellDsp[0],MPI_BYTE,
                   &recvCells[0],&recvCellCnt[0],&recvCellDsp[0],MPI_BYTE,MPI_COMM_WORLD);

include/partition.h

     if(MPISIZE == 1) LEVEL = 0;                                 // Level is 0 for a serial execution
     XMIN.resize(LEVEL+1);                                       // Minimum position vector at each level
     XMAX.resize(LEVEL+1);                                       // Maximum position vector at each level
+    for( int d=0; d!=3; ++d ) {                                 // Loop over each dimension
+      XMIN[0][d] = 1 / EPS;                                     //  Initialize XMIN[0]
+      XMAX[0][d] = - 1 / EPS;                                   //  Initialize XMAX[0]
+    }                                                           // End loop over each dimension
     startTimer("Split comm");                                   // Start timer
     nprocs[0][0] = nprocs[0][1] = MPISIZE;                      // Initialize number of processes in groups
     offset[0][0] = offset[0][1] = 0;                            // Initialize offset of body in groups
 
 //! Set bounds of domain to be partitioned
   void setGlobDomain(Bodies &bodies, vect x0=0, real r0=M_PI) {
-    numCells1D = 1 << getMaxLevel(bodies);                      // Set initial number of bodies
+    numCells1D = 1 << getMaxLevel(bodies);                      // Number of leaf cells in each dimensions
     B_iter B = bodies.begin();                                  // Reset body iterator
-    XMIN[0] = XMAX[0] = B->X;                                   // Initialize xmin,xmax
     MPI_Datatype MPI_TYPE = getType(XMIN[0][0]);                // Get MPI data type
     for( B=bodies.begin(); B!=bodies.end(); ++B ) {             // Loop over bodies
       for( int d=0; d!=3; ++d ) {                               //  Loop over each dimension
     XMAX[0] = X;                                                // Get data from buffer
     MPI_Allreduce(XMIN[0],X,3,MPI_TYPE,MPI_MIN,MPI_COMM_WORLD); // Reduce global minimum
     XMIN[0] = X;                                                // Get data from buffer
-    if( XMIN[0][0] < x0[0]-r0 || x0[0]+r0 < XMAX[0][0]          //  Check for outliers in x direction
-     || XMIN[0][1] < x0[1]-r0 || x0[1]+r0 < XMAX[0][1]          //  Check for outliers in y direction
-     || XMIN[0][2] < x0[2]-r0 || x0[2]+r0 < XMAX[0][2] ) {      //  Check for outliers in z direction
-      std::cout << "Error: Particles located outside periodic domain @ rank "
+    if( IMAGES != 0 ) {                                         // If periodic boundary
+      if( XMIN[0][0] < x0[0]-r0 || x0[0]+r0 < XMAX[0][0]        //  Check for outliers in x direction
+       || XMIN[0][1] < x0[1]-r0 || x0[1]+r0 < XMAX[0][1]        //  Check for outliers in y direction
+       || XMIN[0][2] < x0[2]-r0 || x0[2]+r0 < XMAX[0][2] ) {    //  Check for outliers in z direction
+        std::cout << "Error: Particles located outside periodic domain @ rank "
                 << MPIRANK << std::endl;                        //   Print error message
-    }                                                           //  End if for outlier checking
-    X0 = x0;                                                    //  Center is [0, 0, 0]
-    R0 = r0;                                                    //  Radius is M_PI
+      }                                                         //  End if for outlier checking
+      X0 = x0;                                                  //  Center is x0
+      R0 = r0;                                                  //  Radius is r0
+    } else {                                                    // If free boundary
+      for( int d=0; d!=3; ++d ) {                               //  Loop over each dimension
+        X0[d] = (XMIN[0][d] + XMAX[0][d]) * .5;                 //   Center of domain
+        R0 = std::max(X0[d]-XMIN[0][d],R0);                     //   Radius of domain
+        R0 = std::max(XMAX[0][d]-X0[d],R0);                     //   Radius of domain
+      }                                                         //  End loop over each dimension
+      R0 *= (1 + EPS);                                          //  Add some leeway to domain size
+    }                                                           // End if for periodic boundary
     XMAX[0] = X0 + R0;                                          // Reposition global maximum
     XMIN[0] = X0 - R0;                                          // Reposition global minimum
   }
         usleep(WAIT);                                           //   Wait for "WAIT" milliseconds
         if( MPIRANK == j ) {                                    //   If it's my turn to print
           std::cout << "MPIRANK : " << j << std::endl;          //    Print rank
-          for(int i=0; i!=9; ++i) std::cout << bodies[numLocal/10*i].I << " ";// Print sampled body indices
+          for(int i=0; i!=9; ++i) std::cout << bodies[numLocal/10*i].ICELL << " ";// Print sampled body indices
           std::cout << std::endl;                               //    New line
         }                                                       //   Endif for my turn
       }                                                         //  End loop over ranks
 #endif
 #if QUARK
 #include "quark.h"
-#endif
-#if MTHREADS
-#include <mttb/task_group.h>
-int omp_get_thread_num() {
-  return 0;
-}
-#define OMP_NUM_THREADS 1
 #else
 #include <omp.h>
-#define OMP_NUM_THREADS 12
 #endif
 
 typedef unsigned           bigint;                              //!< Big integer type
 #endif
 #endif
 
-const int  P        = 10;                                       //!< Order of expansions
-const int  NCRIT    = 100;                                      //!< Number of bodies per cell
-const int  MAXBODY  = 200000;                                   //!< Maximum number of bodies per GPU kernel
+const int  P        = 8;                                        //!< Order of expansions
+const int  NCRIT    = 64;                                       //!< Number of bodies per cell
+const int  MAXBODY  = 50000;                                    //!< Maximum number of bodies per GPU kernel
 const int  MAXCELL  = 10000000;                                 //!< Maximum number of bodies/coefs in cell per GPU kernel
 const real CLET     = 2;                                        //!< LET opening critetia
-const real EPS      = 1e-6;                                     //!< Single precision epsilon
+const real EPS      = 1e-6;                                     //!< Single/double precision epsilon
 const real EPS2     = 0;                                        //!< Softening parameter (squared)
 const real R2MIN    = 0.0001;                                   //!< Minimum value for L-J R^2
 const real R2MAX    = 100.0;                                    //!< Maximum value for L-J R^2
 const int  GPUS     = 3;                                        //!< Number of GPUs per node
 const int  THREADS  = 64;                                       //!< Number of threads per thread-block
-const int  PTHREADS = 4;                                        //!< Number of pthreads in quark
 
 const int MTERM = P*(P+1)*(P+2)/6;                              //!< Number of Cartesian mutlipole terms
 const int LTERM = (P+1)*(P+2)*(P+3)/6;                          //!< Number of Cartesian local terms

kernel/CPUCartesianLaplace.cxx

 void Kernel<Laplace>::P2M(C_iter Ci) {
   real Rmax = 0;
 //  setCenter(Ci);
-  for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NCLEAF; ++B ) {
+  for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {
     vect dist = Ci->X - B->X;
     real R = std::sqrt(norm(dist));
     if( R > Rmax ) Rmax = R;
 
 template<>
 void Kernel<Laplace>::L2P(C_iter Ci) const {
-  for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NCLEAF; ++B ) {
+  for( B_iter B=Ci->LEAF; B!=Ci->LEAF+Ci->NDLEAF; ++B ) {
     vect dist = B->X - Ci->X;
     Lset C, L;
     C[0] = 1;

kernel/CPUEvaluator.cxx

 */
 
 template<Equation equation>
-void Evaluator<equation>::evalP2P(Bodies &ibodies, Bodies &jbodies, bool) {// Evaluate all P2P kernels
-  Xperiodic = 0;                                                // Set periodic coordinate offset
+void Evaluator<equation>::evalP2P(Bodies &ibodies, Bodies &jbodies, bool) {// Evaluate all P2P kernels for periodic
   Cells cells;                                                  // Cells to put target and source bodies
   cells.resize(2);                                              // Resize cells to put target and source bodies
   cells[0].LEAF = ibodies.begin();                              // Iterator of first target leaf
   cells[1].LEAF = jbodies.begin();                              // Iterator of first source leaf
   cells[1].NDLEAF = jbodies.size();                             // Number of source leafs
   C_iter Ci = cells.begin(), Cj = cells.begin()+1;              // Iterator of target and source cells
-  P2P(Ci,Cj);                                                   // Perform P2P kernel
+  int prange = getPeriodicRange();                              // Get range of periodic images
+  for( int ix=-prange; ix<=prange; ++ix ) {                     // Loop over x periodic direction
+    for( int iy=-prange; iy<=prange; ++iy ) {                   //  Loop over y periodic direction
+      for( int iz=-prange; iz<=prange; ++iz ) {                 //   Loop over z periodic direction
+        Xperiodic[0] = ix * 2 * R0;                             //    Shift x position
+        Xperiodic[1] = iy * 2 * R0;                             //    Shift y position
+        Xperiodic[2] = iz * 2 * R0;                             //    Shift z position
+        P2P(Ci,Cj);                                             //    Perform P2P kernel
+      }                                                         //   End loop over z periodic direction
+    }                                                           //  End loop over y periodic direction
+  }                                                             // End loop over x periodic direction
 }
 
 template<Equation equation>
 void Evaluator<equation>::evalEwaldReal(Cells &cells) {         // Evaluate queued Ewald real kernels
   startTimer("evalEwaldReal");                                  // Start timer
   Ci0 = cells.begin();                                          // Set begin iterator
-  for( C_iter Ci=cells.begin(); Ci!=cells.end(); ++Ci ) {       // Loop over cells
-    while( !listP2P[Ci-Ci0].empty() ) {                         //  While M2P interaction list is not empty
-      C_iter Cj = listP2P[Ci-Ci0].back();                       //   Set source cell iterator
+#pragma omp parallel for
+  for( int i=0; i<int(cells.size()); ++i ) {                    // Loop over cells
+    C_iter Ci = Ci0 + i;                                        //  Target cell iterator
+    while( !listP2P[i].empty() ) {                              //  While M2P interaction list is not empty
+      C_iter Cj = listP2P[i].back();                            //   Set source cell iterator
       EwaldReal(Ci,Cj);                                         //   Perform Ewald real kernel
-      listP2P[Ci-Ci0].pop_back();                               //   Pop last element from M2P interaction list
+      listP2P[i].pop_back();                                    //   Pop last element from M2P interaction list
     }                                                           //  End while for M2P interaction list
   }                                                             // End loop over cells topdown
   listP2P.clear();                                              // Clear interaction lists

kernel/CPUEwaldLaplace.cxx

 namespace {
 void dft(Ewalds &ewalds, Bodies &bodies, real R0) {
   real scale = M_PI / R0;
-  for( E_iter E=ewalds.begin(); E!=ewalds.end(); ++E ) {
+#pragma omp parallel for
+  for( int i=0; i<int(ewalds.size()); ++i ) {
+    E_iter E = ewalds.begin() + i;
     E->REAL = E->IMAG = 0;
     for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {
       real th = 0;
 
 void idft(Ewalds &ewalds, Bodies &bodies, real R0) {
   real scale = M_PI / R0;
-  for( B_iter B=bodies.begin(); B!=bodies.end(); ++B ) {
+#pragma omp parallel for
+  for( int i=0; i<int(bodies.size()); ++i ) {
+    B_iter B = bodies.begin() + i;
     vec<4,real> TRG = 0;
     for( E_iter E=ewalds.begin(); E!=ewalds.end(); ++E ) {
       real th = 0;
 
 template<>
 void Kernel<Laplace>::EwaldReal(C_iter Ci, C_iter Cj) const {   // Ewald real part on CPU
-  for( B_iter Bi=Ci->LEAF; Bi!=Ci->LEAF+Ci->NDLEAF; ++Bi ) {    // Loop over target bodies
+  for( int i=0; i<Ci->NDLEAF; ++i ) {                           // Loop over target bodies
+    B_iter Bi = Ci->LEAF + i;                                   //  Target body iterator
     for( B_iter Bj=Cj->LEAF; Bj!=Cj->LEAF+Cj->NDLEAF; ++Bj ) {  //  Loop over source bodies
       vect dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
       for( int d=0; d<3; d++ ) {                                //   Loop over dimensions

kernel/CPUP2P.cxx

 template<>
 void Kernel<Laplace>::P2P(C_iter Ci, C_iter Cj) const {         // Laplace P2P kernel on CPU
 #ifndef SPARC_SIMD
-  for( B_iter Bi=Ci->LEAF; Bi!=Ci->LEAF+Ci->NDLEAF; ++Bi ) {    // Loop over target bodies
+#pragma omp parallel for
+  for( int i=0; i<Ci->NDLEAF; ++i ) {                           // Loop over target bodies
+    B_iter Bi = Ci->LEAF+i;                                     //  Target body iterator
     real P0 = 0;                                                //  Initialize potential
     vect F0 = 0;                                                //  Initialize force
     for( B_iter Bj=Cj->LEAF; Bj!=Cj->LEAF+Cj->NDLEAF; ++Bj ) {  //  Loop over source bodies
-      vect dist = Bi->X - Bj->X -Xperiodic;                     //   Distance vector from source to target
+      vect dist = Bi->X - Bj->X - Xperiodic;                    //   Distance vector from source to target
       real R2 = norm(dist) + EPS2;                              //   R^2
       real invR2 = 1.0 / R2;                                    //   1 / R^2
       if( R2 == 0 ) invR2 = 0;                                  //   Exclude self interaction

kernel/GPUEvaluator.cxx

 
 template<Equation equation>
 void Evaluator<equation>::evalP2P(Bodies &ibodies, Bodies &jbodies, bool onCPU) {// Evaluate all P2P kernels
-  startTimer("evalP2P");                                        // Start timer
+  int prange = getPeriodicRange();                              // Get range of periodic images
+  int numImages = pow(2 * prange + 1,3);                        // Get number of periodic images
   int numIcall = int(ibodies.size()-1)/MAXBODY+1;               // Number of icall loops
-  int numJcall = int(jbodies.size()-1)/MAXBODY+1;               // Number of jcall loops
+  int numJcall = int(jbodies.size()-1)/(MAXBODY/numImages)+1;   // Number of jcall loops
   int ioffset = 0;                                              // Initialzie offset for icall loops
   Cells cells(2);                                               // Two cells to put target and source bodies
   for( int icall=0; icall!=numIcall; ++icall ) {                // Loop over icall
     int joffset = 0;                                            //  Initialize offset for jcall loops
     for( int jcall=0; jcall!=numJcall; ++jcall ) {              //  Loop over jcall
       B_iter Bj0 = jbodies.begin()+joffset;                     //  Set source bodies begin iterator
-      B_iter BjN = jbodies.begin()+std::min(joffset+MAXBODY,int(jbodies.size()));// Set source bodies end iterator
+      B_iter BjN = jbodies.begin()+std::min(joffset+MAXBODY/numImages,int(jbodies.size()));// Set source bodies end iterator
       cells[1].LEAF = Bj0;                                      //  Iterator of first source leaf
       cells[1].NDLEAF = BjN-Bj0;                                //  Number of source leafs
       C_iter Ci = cells.begin(), Cj = cells.begin()+1;          //  Iterator of target and source cells
       if( onCPU ) {                                             //  If calculation is to be done on CPU
-        Xperiodic = 0;                                          //   Set periodic coordinate offset
-        P2P(Ci,Cj);                                             //   Perform P2P kernel on CPU
+        for( int ix=-prange; ix<=prange; ++ix ) {               //   Loop over x periodic direction
+          for( int iy=-prange; iy<=prange; ++iy ) {             //    Loop over y periodic direction
+            for( int iz=-prange; iz<=prange; ++iz ) {           //     Loop over z periodic direction
+              Xperiodic[0] = ix * 2 * R0;                       //      Shift x position
+              Xperiodic[1] = iy * 2 * R0;                       //      Shift y position
+              Xperiodic[2] = iz * 2 * R0;                       //      Shift z position
+              P2P(Ci,Cj);                                       //      Perform P2P kernel on CPU
+            }                                                   //     End loop over z periodic direction
+          }                                                     //    End loop over y periodic direction
+        }                                                       //   End loop over x periodic direction
       } else {                                                  //  If calculation is to be done on GPU
         constHost.push_back(2*R0);                              //   Copy domain size to GPU buffer
-        for( B_iter B=Bj0; B!=BjN; ++B ) {                      //   Loop over source bodies
-          sourceHost.push_back(B->X[0]);                        //   Copy x position to GPU buffer
-          sourceHost.push_back(B->X[1]);                        //   Copy y position to GPU buffer
-          sourceHost.push_back(B->X[2]);                        //   Copy z position to GPU buffer
-          sourceHost.push_back(B->SRC);                         //   Copy source value to GPU buffer
-        }                                                       //   End loop over source bodies
+        for( int ix=-prange; ix<=prange; ++ix ) {               //   Loop over x periodic direction
+          for( int iy=-prange; iy<=prange; ++iy ) {             //    Loop over y periodic direction
+            for( int iz=-prange; iz<=prange; ++iz ) {           //     Loop over z periodic direction
+              for( B_iter B=Bj0; B!=BjN; ++B ) {                //      Loop over source bodies
+                sourceHost.push_back(B->X[0] + ix * 2 * R0);    //       Copy x position to GPU buffer
+                sourceHost.push_back(B->X[1] + iy * 2 * R0);    //       Copy y position to GPU buffer
+                sourceHost.push_back(B->X[2] + iz * 2 * R0);    //       Copy z position to GPU buffer
+                sourceHost.push_back(B->SRC);                   //       Copy source value to GPU buffer
+              }                                                 //      End loop over source bodies
+            }                                                   //     End loop over z periodic direction
+          }                                                     //    End loop over y periodic direction
+        }                                                       //   End loop over x periodic direction
         int key = 0;                                            //   Initialize key to range of leafs in source cells
         int blocks = (BiN - Bi0 - 1) / THREADS + 1;             //   Number of thread blocks needed for this target cell
         for( int i=0; i!=blocks; ++i ) {                        //   Loop over thread blocks
         }                                                       //   End loop over thread blocks
         rangeHost.push_back(1);                                 //   Save size of interaction list
         rangeHost.push_back(0);                                 //   Set begin index of leafs
-        rangeHost.push_back(BjN-Bj0);                           //   Set number of leafs
+        rangeHost.push_back((BjN-Bj0)*numImages);               //   Set number of leafs
         rangeHost.push_back(Icenter);                           //   Set periodic image flag
         for( B_iter B=Bi0; B!=BiN; ++B ) {                      //   Loop over target bodies
           targetHost.push_back(B->X[0]);                        //    Copy x position to GPU buffer
         targetHost.clear();                                     //   Clear target vector
         sourceHost.clear();                                     //   Clear source vector
       }                                                         //  Endif for CPU/GPU switch
-      joffset += MAXBODY;                                       //  Increment jcall offset
+      joffset += MAXBODY/numImages;                             //  Increment jcall offset
     }                                                           // End loop over jcall
     ioffset += MAXBODY;                                         // Increment icall offset
   }                                                             // End loop over icall
-  stopTimer("evalP2P");                                         // Stop timer
 }
 
 template<Equation equation>

kernel/GPUSphericalLaplace.cu

   gpureal x = cosf(alpha);                                      // x = cos(alpha)
   gpureal y = sinf(alpha);                                      // y = sin(alpha)
   gpureal rho_1 = 1 / rho;                                      // 1 / rho
-  for( int l=threadIdx.x; l<(P+1)*P/2; l+=THREADS ) {           // Loop over coefficients in Ynm
+  for( int l=threadIdx.x; l<(2*P+1)*P; l+=THREADS ) {           // Loop over coefficients in Ynm
     gpureal fact = 1;                                           //  Initialize 2 * m + 1
     gpureal pn = 1;                                             //  Initialize Legendre polynomial Pn
     gpureal rhom = rho_1;                                       //  Initialize rho^(-m-1)
   k = threadIdx.x - k;
   if( threadIdx.x >= NTERM ) j = k = 0;
   gpureal ajk = ODDEVEN(j) * rsqrtf(factShrd[j-k] * factShrd[j+k]);
-  for( int n=0; n<P-j; ++n ) {
+  for( int n=0; n<P; ++n ) {
     for( int m=-n; m<0; ++m ) {
       int jnkm = (j + n) * (j + n + 1) / 2 - m + k;
       gpureal ere = cosf((m - k) * beta);
   gpureal target[2] = {0, 0};
   __shared__ gpureal sourceShrd[2*THREADS];
   __shared__ gpureal factShrd[2*P];
-  __shared__ gpureal YnmShrd[NTERM];
+  __shared__ gpureal YnmShrd[4*NTERM];
   gpureal fact = EPS;
   for( int i=0; i<2*P; ++i ) {
     factShrd[i] = fact;

mr3/vtgrape.c.orig

-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <math.h>
-#include <string.h>
-#include <pthread.h>
-#include <sys/time.h>
-
-#include "vtgrape.h"
-#include "md.h"            // application specific include file
-#include "vtgrapeproto.h"
-//#include "md.c"
-
-void lj_mother(void *);
-void ljpot_mother(void *);
-void grav_mother(void *);
-void gravpot_mother(void *);
-void real_mother(void *);
-void realpot_mother(void *);
-void wave_mother(void *);
-void wavepot_mother(void *);
-void nacl_mother(void *);
-void r1_mother(void *);
-void rsqrt_mother(void *);
-void mul_mother(void *);
-void gravfb_mother(void *);
-void gravpotfb_mother(void *);
-void ljfb_mother(void *);
-void ljpotfb_mother(void *);
-void realfb_mother(void *);
-void realpotfb_mother(void *);
-#ifdef MD_CELLINDEX
-void ljfbci_mother(void *);
-void ljpotfbci_mother(void *);
-void realci_mother(void *);
-void realpotci_mother(void *);
-void realfbci_mother(void *);
-void realpotfbci_mother(void *);
-void realljfbci_mother(void *);
-void realljfbci2_mother(void *);
-void realljpotfbci_nojdiv_mother(void *);
-void realljpotfbci_mother(void *);
-#endif
-
-
-typedef union {
-  int i;
-  float f;
-} FI;
-
-typedef union {
-  struct{
-    FI fi0;
-    FI fi1;
-  } fi2;
-  double d;
-} DI2;
-
-#if MD_PERIODIC_FIXED==1
-#define COPY_POS(vgvec,pos,volume_1) \
-        /*di2.d=(pos)*xmax_1+0x180000;*/\
-        di2.d=(pos)*(volume_1)+0x180000;\
-        (vgvec)=di2.fi2.fi0.f;\
-        /*printf("in COPY_POS : pos=%e xmax_1=%e vgvec=%08x\n",pos,xmax_1,vgvec);*/
-#define COPY_POS_INT(vgvec,pos,volume_1) \
-        di2.d=(pos)*(volume_1)+0x180000;\
-        (vgvec)=di2.fi2.fi0.i;\
-        /*printf("in COPY_POS_INT : pos=%e xmax_1=%e vgvec=%08x\n",pos,xmax_1,vgvec);*/
-#define COPY_POS_DEFINE \
-        /*double xmax_1=1.0/unit->xmax;*/\
-        double volume_1[3]={1.0/unit->volume[0],1.0/unit->volume[1],1.0/unit->volume[2]};\
-        DI2 di2;
-#else
-#define COPY_POS(vgvec,pos,dum) \
-        (vgvec)=(float)(pos);
-#define COPY_POS_INT(vgvec,pos,dum) \
-        (vgvec)=(float)(pos);
-#define COPY_POS_DEFINE \
-        double volume_1[3]={1.0/unit->volume[0],1.0/unit->volume[1],1.0/unit->volume[2]};
-#endif
-
-
-static double  Coulomb_vdw_factor=1.0;
-static VG_UNIT *MR3_unit;
-#ifdef MD_CELLINDEX
-static int Ldim[3]={1,1,1};
-#endif
-
-#ifdef MD_GENERATE_EMUTABLE
-static float *Emutable_r1=NULL;
-static float *Emutable_rsqrt=NULL;
-#endif
-#ifdef MD_MEASURE_TIME
-double Time[MD_MEASURE_TIME][3];
-                  /* [][0]-laptime, [][1]-sprittime, [][2]-sum
-		      0 -  9 : 
-		     10 - 19 :
-                     20 - 29 : MR3calccoulomb_vdw_nlist_ij_emu_work
-                                 depending on tblno
-		                 0 - coulomb force
-				 1 - coulomb potential
-				 6 - real-space force
-				 7 - real-space potential
-                     30 - 39 : MR3calccoulomb_vdw_ij_ci_virial
-		                 0 - total
-                                 1 - before assign cell
-                                 2 - assign cell
-                                 3 - copy positions to cell
-				 4 - other setup on cell
-				 5 - copy j data to GPU
-				 6 - copy i data to GPU
-				 7 - starting kernel on GPU
-				 8 - wait before copying result
-				 9 - copy result
-                     40 - 49 : 
-                                 0 - total mr3calccoulomb_vdw_ij_ci_exlist_
-                                 1 - total MR3calccoulomb_vdw_ij_ci_exlist
-				 2 -         MR3calccoulomb_vdw_ij_ci_virial
-				 3 -         MR3calccoulomb_vdw_nlist_ij_emu
-                                 4 - total mr3calccoulomb_vdw_exlist_
-                                 5 -         before GPU call
-                                 6 -         calling MR3calccoulomb_vdw_exlist
-                                 7 -                  
-				 8 -         MR3calccoulomb_vdw_ij_ci
-                     50 - 59 : 
-                                 0 - total MR3calccoulomb_vdw_ij_exlist
-				 1 -         nonoverlap thread : MR3calccoulomb_vdw_ij_exlist_core
-				 2 -         overlap thread : total
-                                 3 -     potential emu : MR3calccoulomb_vdw_nlist_ij_emu in MR3calccoulomb_vdw_ij_exlist_core
-				 4 -     potentail GPU : this part is divided into 60-69
-				 5 -     force     GPU : this part is divided into 60-69
-                                 6 -     force     emu : 
-				 7 -     force copy    : 
-                     60 - 69 :
-                                 0 - total MR3calccoulomb_vdw_ij_ci_old_09
-                                 1 - before assign cell
-				 2 - assign cell
-				 3 - copy positions to cell
-				 4 - other setup on cell
-				 5 - copy j data to GPU
-				 6 - copy i data to GPU
-				 7 - starting kernel on GPU
-				 8 - wait before copying result
-				 9 - copy result
-		  */
-#endif
-
-
-static void vg_get_cputime(double *laptime, double *sprittime)
-{
-  struct timeval tv;
-  struct timezone tz;
-  double sec,microsec;
-
-  gettimeofday(&tv, &tz);
-  sec=tv.tv_sec;
-  microsec=tv.tv_usec;
-
-  *sprittime = sec + microsec * 1e-6 - *laptime;
-  *laptime = sec + microsec * 1e-6;
-  //  printf("    vg_get_cputime is called: ltime=%e stime=%e\n",*laptime,*sprittime);
-}
-
-
-#ifdef MD_MEASURE_TIME
-static void vg_start_timer(int timerindex)
-{
-  vg_get_cputime(&Time[timerindex][0], &Time[timerindex][1]);
-}
-
-
-static void vg_stop_and_accumulate_timer(int timerindex)
-{
-  vg_get_cputime(&Time[timerindex][0], &Time[timerindex][1]);
-  Time[timerindex][2]+=Time[timerindex][1];
-}
-#endif
-
-
-void *MR3_my_malloc2(size_t n, char *s)
-{
-  void *ret;
-
-#if 0
-  {
-    static int count=0;
-    if(count<34){
-      printf("** count=%d doubling the allocation **\n",count);
-      n*=2;
-    }
-    count++;
-  }
-#endif
-  printf("Allocating %d bytes for %s\n",(int)n,s);
-  if(n==0){
-    fprintf(stderr,"** warning : mallocing %d **\n",(int)n);
-    //    sleep(1000);
-    MR3_exit(1);
-  }
-  ret=malloc(n);
-  printf("Allocated pointer=%016llx which is %s\n",
-	 (unsigned long long)ret,s);
-
-  return ret;
-}
-
-
-void MR3_my_free2(void **p, char *s)
-{
-  printf("Freeing pointer=%016llx which is %s\n",
-	 (unsigned long long)(*p),s);
-  if(*p==NULL){
-    printf("** warning : pointer=%s is NULL before free **\n",s);
-  }
-  free(*p);
-  *p=NULL;
-}
-
-
-VG_UNIT *m3_get_unit(void)
-{
-  return MR3_unit;
-}
-
-
-#if 1
-void vg_exit(int ret)
-{
-  exit(ret);
-}
-#else
-int vg_exit(int ret)
-{
-  return ret;
-}
-#endif
-
-
-VG_UNIT *vg_allocate_unit(void (*function)(void *))
-{
-  int i;
-  VG_UNIT *unit;
-  char *s;
-
-#ifdef MD_QAUNION_ATYPEBIT
-  if((1<<MD_QAUNION_ATYPEBIT)*(1<<MD_QAUNION_ATYPEBIT)
-     <VG_MINIMUM_ATYPE_BLOCK){
-    fprintf(stderr,"** error : (2^MD_QAUNION_ATYPEBIT)^2 must not be lower than VG_MINIMUM_ATYPE_BLOCK **\n");
-    vg_exit(1);
-  }
-#endif
-  if((unit=(VG_UNIT *)MR3_malloc_pointer(sizeof(VG_UNIT),"*vg_allocate_unit"))==NULL){
-    //    fprintf(stderr,"** error : can't malloc unit **\n")
-    return NULL;
-  }
-  unit->deviceid=0;
-  s=getenv("VG_DEVICEID");
-  if(s!=NULL){
-    sscanf(s,"%d",&(unit->deviceid));
-    printf("VG_DEVICEID is set %d\n",unit->deviceid);
-  }
-  unit->nj=unit->ni=unit->nati=unit->natj=unit->nf=0;
-  unit->jvectors=unit->ivectors=unit->matrices=NULL;
-  unit->scalers=unit->fvectors=unit->pscalers=NULL;
-  unit->calculate[0]=function;
-  for(i=1;i<VG_MAX_FUNCTIONS;i++) unit->calculate[i]=NULL;
-  unit->r1=NULL;
-  unit->rsqrt=NULL;
-  //  unit->fvec=NULL;
-  unit->fthread=NULL;
-  unit->thread=0;
-  unit->ni_overlap=0;
-  unit->potc=0.0;
-  unit->potv=0.0;
-  unit->rcut2=0.0;
-  unit->cuton=CUTON_DEFAULT;
-  unit->debug_flag=0;
-  unit->jsize=unit->isize=unit->fsize=0;
-
-  // for GPU overlap
-  unit->gpuoverlapflag=unit->function_index=0;
-#ifdef MD_QAUNION_ATYPEBIT
-  //  unit->scaleqi_1=unit->scaleqj_1=0.0;
-#endif
-
-  if(unit->scalers==NULL){
-    unit->ssize=sizeof(VG_SCALER);
-    if((unit->scalers=(void *)MR3_malloc_pointer(unit->ssize,"vg_allocate_unit"))==NULL){
-      fprintf(stderr,"** error : can't malloc scalers in vg_allocate_unit **\n");
-      vg_exit(1);
-    }
-  }
-
-  return unit;
-}
-
-
-void vg_free_unit(VG_UNIT *unit)
-{
-  MR3_free_pointer(unit->jvectors,"jvectors in vg_free_unit");
-  MR3_free_pointer(unit->ivectors,"ivectors in vg_free_unit");
-  MR3_free_pointer(unit->matrices,"matrices in vg_free_unit");
-  MR3_free_pointer(unit->scalers,"scalers in vg_free_unit");
-  MR3_free_pointer(unit->fvectors,"fvectoers in vg_free_unit");
-  MR3_free_pointer(unit->pscalers,"pscalers in vg_free_unit");
-  MR3_free_pointer(unit->r1,"r1 in vg_free_unit");
-  MR3_free_pointer(unit->rsqrt,"rsqrt in vg_free_unit");
-  //  MR3_free_pointer(unit->fvec,"fvec in vg_free_unit");
-  MR3_free_pointer(unit->fthread,"fthread in vg_free_unit");
-  MR3_free_pointer(unit,"unit in vg_free_unit");
-}
-
-
-void vg_set_function(VG_UNIT *unit, int function_index, void (*function)(void *))
-{
-  if(function_index>=VG_MAX_FUNCTIONS){
-    fprintf(stderr,"** error : function_index=%d is out of range **\n",
-	    function_index);
-    vg_exit(1);
-  }
-  unit->calculate[function_index]=function;
-}
-
-
-void vg_set_vectors(VG_UNIT *unit, int nj, void *jvectors)
-{
-  int size;
-
-  printf("** warning: free and malloc of jvectors should be modified **\n");
-  size=sizeof(VG_JVEC)*NJ_ROUNDUP(nj);
-  unit->nj=nj;
-  //  fprintf(stderr,"in vg_set_vectors: unit->jvectors=%x before free\n",unit->jvectors);
-  MR3_free_pointer(unit->jvectors,"vg_set_vectors");
-  if((unit->jvectors=(void *)MR3_malloc_pointer(size,"vg_set_vectors"))==NULL){
-    fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors **\n");
-    vg_exit(1);
-  }
-  //  fprintf(stderr,"in vg_set_vectors: unit->jvectors=%x after malloc %d\n",unit->jvectors,size);
-  memcpy(unit->jvectors,jvectors,size);
-}
-
-
-void vg_set_matrices(VG_UNIT *unit, int nati, int natj, void *matrices)
-{
-  int size;
-
-  if(nati*natj>VG_MINIMUM_ATYPE_BLOCK){
-    fprintf(stderr,"** error : nati*natj>VG_MINIMUM_ATYPE_BLOCK(%d) **\n",
-	    VG_MINIMUM_ATYPE_BLOCK);
-    vg_exit(1);
-  }
-  size=sizeof(VG_MATRIX)*MATRIX_ROUNDUP(nati*natj);
-  unit->nati=nati;
-  unit->natj=natj;
-  MR3_free_pointer(unit->matrices,"vg_set_matrices");
-  if((unit->matrices=(void *)MR3_malloc_pointer(size,"vg_set_matrices"))==NULL){
-    fprintf(stderr,"** error : can't malloc matrices in vg_set_vectors **\n");
-    vg_exit(1);
-  }
-  memcpy(unit->matrices,matrices,size);
-}
-
-
-void vg_set_scalers(VG_UNIT *unit, void *scalers)
-{
-  unit->ssize=sizeof(VG_SCALER);
-  MR3_free_pointer(unit->scalers,"vg_set_scalers");
-  if((unit->scalers=(void *)MR3_malloc_pointer(unit->ssize,"vg_set_scalers"))==NULL){
-    fprintf(stderr,"** error : can't malloc scalers in vg_set_vectors **\n");
-    vg_exit(1);
-  }
-  memcpy(unit->scalers,scalers,unit->ssize);
-}
-
-
-void vg_set_pipeline_vectors(VG_UNIT *unit, int ni, void *ivectors)
-{
-  int size;
-
-  printf("** warning: free and malloc of ivectors should be modified **\n");
-  size=sizeof(VG_IVEC)*NI_ROUNDUP(ni);
-  //  if(ni==0){fprintf(stderr,"** error : ni=0 in vg_set_pipeline_vectors **\n");exit(1);}
-  unit->ni=ni;
-  MR3_free_pointer(unit->ivectors,"vg_set_pipeline_vectors");
-  if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors"))==NULL){
-    fprintf(stderr,"** error : can't malloc ivectors in vg_set_vectors **\n");
-    vg_exit(1);
-  }
-  memcpy(unit->ivectors,ivectors,size);
-}
-
-
-void vg_calculate(VG_UNIT *unit, int function_index, 
-		  int nf, void *fvectors, void *pscalers)
-{
-  int fsize,psize;
-
-  fsize=sizeof(VG_FVEC)*NF_ROUNDUP(nf);
-  unit->nf=nf;
-  MR3_free_pointer(unit->fvectors,"vg_calculate");
-  if((unit->fvectors=(void *)MR3_malloc_pointer(fsize,"vg_calculate"))==NULL){
-    fprintf(stderr,"** error : can't malloc fvectors in vg_calculate **\n");
-    vg_exit(1);
-  }
-  bzero(unit->fvectors,fsize);
-  psize=sizeof(VG_PSCALER);
-#ifdef MD_CELLINDEX
-  if(unit->pscalers==NULL){
-    if((unit->pscalers=(VG_PSCALER *)MR3_malloc_pointer(sizeof(VG_PSCALER),"vg_calculate"))==NULL){
-      fprintf(stderr,"** error : can't malloc unit->pscalers in vg_calculate **\n");
-      vg_exit(1);
-    }
-  }
-#else
-  MR3_free_pointer(unit->pscalers,"vg_calculate");
-  if((unit->pscalers=(void *)MR3_malloc_pointer(psize,"vg_calculate"))==NULL){
-    fprintf(stderr,"** error : can't malloc pscalers in vg_calculate **\n");
-    vg_exit(1);
-  }
-  bzero(unit->pscalers,psize);
-#endif
-  unit->calculate[function_index]((void *)unit);
-  memcpy(fvectors,unit->fvectors,fsize);
-  memcpy(pscalers,unit->pscalers,psize);
-}
-
-
-void vg_set_vectors_pos_charge_atype(VG_UNIT *unit, int nj,
-				     double pos[][3], double charge[],
-				     int atype[])
-{
-#if defined(MD_USE_QAUNION) && defined(MD_QAUNION_ATYPEBIT)
-  int i,j,size;
-  VG_JVEC *jvec;
-  double scaleq;
-  float scaleq_1;
-  COPY_POS_DEFINE;
-
-  size=sizeof(VG_JVEC)*NJ_ROUNDUP(nj);
-  //  size*=2;printf("** size of j-particles is doubled in vg_set_vectors_pos_charge_atype\n");
-
-  unit->nj=nj;
-  if(unit->jvectors==NULL){
-    if(size<unit->jsize) size=unit->jsize;
-    unit->jsize=size;
-#ifdef MD_PRINT_WARN
-    printf("allocating %d jvectors\n",size);
-#endif
-    if((unit->jvectors=(void *)MR3_malloc_pointer(size,"void vg_set_vectors_pos_charge_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_charge_atype **\n");
-      vg_exit(1);
-    }
-  }
-  else if(size>unit->jsize){
-#ifdef MD_PRINT_WARN
-    printf("reallocating %d jvectors\n",size);
-#endif
-    MR3_free_pointer(unit->jvectors,"vg_set_vectors_pos_charge_atype");
-    if((unit->jvectors=(void *)MR3_malloc_pointer(size,"vg_set_vectors_pos_charge_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_charge_atype **\n");
-      vg_exit(1);
-    }
-    unit->jsize=size;
-  }
-  vg_set_scaleq(nj,charge,&scaleq,&scaleq_1);
-  if(unit->scalers!=NULL){
-    ((VG_SCALER *)(unit->scalers))->scaleqj_1=scaleq_1;
-  }
-  else{
-    fprintf(stderr,"** error : unit->scalers=NULL **\n");
-    vg_exit(1);
-  }
-  for(i=0;i<nj;i++){
-    jvec=(VG_JVEC *)(unit->jvectors)+i;
-    for(j=0;j<3;j++){
-      COPY_POS(jvec->r[j],pos[i][j],volume_1[j]);
-    }
-    jvec->qatype.atype=Q_DOUBLE_TO_INT(charge[i],scaleq);
-    jvec->qatype.atype|=atype[i] & MASK(MD_QAUNION_ATYPEBIT);
-  }
-#elif defined(MD_NACL)
-  int i,j,size;
-  VG_JVEC *jvec;
-  COPY_POS_DEFINE;
-
-  size=sizeof(VG_JVEC)*NJ_ROUNDUP(nj);
-  unit->nj=nj;
-  if(unit->jvectors==NULL){
-    if(size<unit->jsize) size=unit->jsize;
-    unit->jsize=size;
-#ifdef MD_PRINT_WARN
-    printf("allocating %d jvectors\n",size);
-#endif
-    if((unit->jvectors=(void *)MR3_malloc_pointer(size,"vg_set_vectors_pos_charge_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_charge_atype **\n");
-      vg_exit(1);
-    }
-  }
-  else if(size>unit->jsize){
-#ifdef MD_PRINT_WARN
-    printf("reallocating %d jvectors\n",size);
-#endif
-    MR3_free_pointer(unit->jvectors,"vg_set_vectors_pos_charge");
-    if((unit->jvectors=(void *)MR3_malloc_pointer(size,"vg_set_vectors_pos_charge_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_charge_atype **\n");
-      vg_exit(1);
-    }
-  }
-  for(i=0;i<nj;i++){
-    jvec=(VG_JVEC *)(unit->jvectors)+i;
-    for(j=0;j<3;j++){
-      COPY_POS(jvec->r[j],pos[i][j],volume_1[j]);
-    }
-    jvec->q=(float)(charge[i]);
-#if MD_USE_LARGE_VAL==2
-    jvec->q*=(float)(LOWER_VAL_FACTOR_ORG);
-#endif
-    jvec->atype=atype[i];
-  }
-#else
-  fprintf(stderr,"** error : MD_QAUNION_ATYPEBIT must be defined for vg_set_vectors_pos_charge_atype **\n");
-  vg_exit(1);
-#endif
-}
-
-
-#ifdef MD_SIGMA_EPSION_IN_VGSTRUCT
-void vg_set_vectors_pos_halfsigma_sqrtepsilon(VG_UNIT *unit, int nj,
-					      double pos[][3],
-					      double halfsigma[], 
-					      double sqrtepsilon[])
-{
-  int i,j,size;
-  VG_JVEC *jvec;
-  COPY_POS_DEFINE;
-
-  printf("** warning: free and malloc of jvectors should be modified **\n");
-  size=sizeof(VG_JVEC)*NJ_ROUNDUP(nj);
-  unit->nj=nj;
-  MR3_free_pointer(unit->jvectors,"vg_set_vectors_pos_halfsigma_sqrtepsilon");
-  if((unit->jvectors=(void *)MR3_malloc_pointer(size,"vg_set_vectors_pos_halfsigma_sqrtepsilon"))==NULL){
-    fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_charge_atype **\n");
-    vg_exit(1);
-  }
-  for(i=0;i<nj;i++){
-    jvec=(VG_JVEC *)(unit->jvectors)+i;
-    for(j=0;j<3;j++){
-      COPY_POS(jvec->r[j],pos[i][j],volume_1[j]);
-    }
-    jvec->halfsigma=(float)(halfsigma[i]);
-    jvec->sqrtepsilon=(float)(sqrtepsilon[i]);
-  }
-}
-#endif
-
-
-#ifdef MD_USE_QAUNION
-void vg_set_vectors_pos_charge(VG_UNIT *unit, int nj,
-			       double pos[][3], double charge[])
-{
-  int i,j,size;
-  VG_JVEC *jvec;
-#ifdef MD_QAUNION_ATYPEBIT
-  double scaleq;
-  float scaleq_1;
-#endif
-  COPY_POS_DEFINE;
-
-  size=sizeof(VG_JVEC)*NJ_ROUNDUP(nj);
-  //size=sizeof(VG_JVEC)*NJ_ROUNDUP(nj)*2;printf("size is doubled in vg_set_vectors_pos_charge\n");
-  //  size=sizeof(VG_JVEC)*(NJ_ROUNDUP(nj)+VG_MINIMUM_PARTICLE_BLOCK_J);printf("size is modified in vg_set_vectors_pos_charge\n");
-
-  unit->nj=nj;
-  //  fprintf(stderr,"in vg_set_vectors_pos_charge: unit->jvectors=%x before free\n",unit->jvectors);
-#if 0
-  MR3_free_pointer(unit->jvectors,"vg_set_vectors_pos_charges");
-  if((unit->jvectors=(void *)MR3_malloc_pointer(size,"unit->jvectors in vg_set_vectors_pos_charge"))==NULL){
-    fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_charge **\n");
-    vg_exit(1);
-  }
-#endif
-#if 0
-  if(unit->jvectors==NULL && (unit->jvectors=(void *)MR3_malloc_pointer(size,"unit->jvectors in vg_set_vectors_pos_charge"))==NULL){
-    fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_charge **\n");
-    vg_exit(1);
-  }
-  fprintf(stderr,"** warning : malloc of unit->jvectors is skipped in vg_set_vectors_pos_charge **\n");
-#endif
-#if 1
-  if(unit->jvectors==NULL){
-    if(size<unit->jsize) size=unit->jsize;
-    unit->jsize=size;
-#ifdef MD_PRINT_WARN
-    printf("allocating %d jvectors\n",size);
-#endif
-    if((unit->jvectors=(void *)MR3_malloc_pointer(size,"void vg_set_vectors_pos_charge"))==NULL){
-      fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_charge **\n");
-      vg_exit(1);
-    }
-  }
-  else if(size>unit->jsize){
-#ifdef MD_PRINT_WARN
-    printf("reallocating %d jvectors\n",size);
-#endif
-    MR3_free_pointer(unit->jvectors,"vg_set_vectors_pos_charge");
-    if((unit->jvectors=(void *)MR3_malloc_pointer(size,"vg_set_vectors_pos_charge"))==NULL){
-      fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_charge **\n");
-      vg_exit(1);
-    }
-    unit->jsize=size;
-  }
-#endif
-  //  fprintf(stderr,"in vg_set_vectors_pos_charge: unit->jvectors=%x after malloc %d\n",unit->jvectors,size);
-#ifdef MD_QAUNION_ATYPEBIT
-  vg_set_scaleq(nj,charge,&scaleq,&scaleq_1);
-  if(unit->scalers!=NULL){
-    ((VG_SCALER *)(unit->scalers))->scaleqj_1=scaleq_1;
-#if MD_USE_LARGE_VAL==2 || MD_USE_LARGE_VAL==20
-    //    ((VG_SCALER *)(unit->scalers))->scaleqj_1*=LOWER_VAL_FACTOR_ORG;
-#endif
-  }
-  else{
-    fprintf(stderr,"** error : unit->scalers=NULL **\n");
-    vg_exit(1);
-  }
-#endif
-  for(i=0;i<nj;i++){
-    jvec=(VG_JVEC *)(unit->jvectors)+i;
-    for(j=0;j<3;j++){
-      COPY_POS(jvec->r[j],pos[i][j],volume_1[j]);
-    }
-#ifdef MD_QAUNION_ATYPEBIT
-    jvec->qatype.atype=Q_DOUBLE_TO_INT(charge[i],scaleq) | \
-      (MASK(MD_QAUNION_ATYPEBIT) & jvec->qatype.atype);
-#else
-    jvec->qatype.q=(float)(charge[i]);
-#if MD_USE_LARGE_VAL==2
-    jvec->qatype.q*=(float)(LOWER_VAL_FACTOR_ORG);
-#endif
-#endif
-  }
-}
-
-
-void vg_set_vectors_pos_atype(VG_UNIT *unit, int nj,
-			      double pos[][3], int atype[])
-{
-  int i,j,size;
-  VG_JVEC *jvec;
-  COPY_POS_DEFINE;
-
-  size=sizeof(VG_JVEC)*NJ_ROUNDUP(nj);
-  //  size=sizeof(VG_JVEC)*NJ_ROUNDUP(nj)*2;printf("size is doubled in vg_set_vectors_pos_charge\n");
-  //  size=sizeof(VG_JVEC)*(NJ_ROUNDUP(nj)+VG_MINIMUM_PARTICLE_BLOCK_J);printf("size is modified in vg_set_vectors_pos_charge\n");
-
-  unit->nj=nj;
-  //  fprintf(stderr,"in vg_set_vectors_pos_atype: unit->jvectors=%x before free\n",unit->jvectors);
-#if 0
-  MR3_free_pointer(unit->jvectors,"vg_set_vectors_pos_atype");
-  if((unit->jvectors=(void *)MR3_malloc_pointer(size,"vg_set_vectors_pos_atype"))==NULL){
-    fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_atype **\n");
-    vg_exit(1);
-  }
-#endif
-#if 0
-  if(unit->jvectors==NULL && (unit->jvectors=(void *)MR3_malloc_pointer(size,"vg_set_vectors_pos_atype"))==NULL){
-    fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_atype **\n");
-    vg_exit(1);
-  }
-  fprintf(stderr,"** warning : malloc of unit->jvectors is skipped in vg_set_vectors_pos_atype **\n");
-#endif
-#if 1
-  if(unit->jvectors==NULL){
-    if(size<unit->jsize) size=unit->jsize;
-    unit->jsize=size;
-#ifdef MD_PRINT_WARN
-    printf("allocating %d jvectors\n",size);
-#endif
-    if((unit->jvectors=(void *)MR3_malloc_pointer(size,"vg_set_vectors_pos_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_atype **\n");
-      vg_exit(1);
-    }
-  }
-  else if(size>unit->jsize){
-#ifdef MD_PRINT_WARN
-    printf("reallocating %d jvectors\n",size);
-#endif
-    MR3_free_pointer(unit->jvectors,"vg_set_vectors_pos_atype");
-    if((unit->jvectors=(void *)MR3_malloc_pointer(size,"vg_set_vectors_pos_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc jvectors in vg_set_vectors_pos_atype **\n");
-      vg_exit(1);
-    }
-    unit->jsize=size;
-  }
-#endif
-  //  fprintf(stderr,"in vg_set_vectors_pos_atype: unit->jvectors=%x after malloc %d\n",unit->jvectors,size);
-  for(i=0;i<nj;i++){
-    jvec=(VG_JVEC *)(unit->jvectors)+i;
-    for(j=0;j<3;j++){
-      COPY_POS(jvec->r[j],pos[i][j],volume_1[j]);
-      //      printf("  j=%d pos[%d]=%e jvec=%08x\n",i,j,pos[i][j],jvec->r[j]);
-    }
-#ifdef MD_QAUNION_ATYPEBIT
-    jvec->qatype.atype=atype[i] | \
-      (~(MASK(MD_QAUNION_ATYPEBIT)) & jvec->qatype.atype);
-#else
-    jvec->qatype.atype=atype[i];
-#endif
-  }
-}
-#endif
-
-
-void vg_set_matrices_gscales_rscales(VG_UNIT *unit, int nati_org, int natj_org, 
-				     double gscales[], double rscales[])
-{
-  int i,j,size;
-  VG_MATRIX *mat;
-  int nati=nati_org,natj=natj_org;
-
-#if VDW_SHIFT>=1
-  double rcut2;
-  if(unit->rcut2==0.0){
-#if 0
-    printf("** warning : unit->rcut2 is not set. using default %f **\n",
-	   MD_LJ_R2MAX);
-    unit->rcut2=MD_LJ_R2MAX;
-#else
-    fprintf(stderr,"** error : unit->rcut2 must be set **\n");
-    vg_exit(1);
-#endif
-  }
-  rcut2=unit->rcut2;
-#endif
-#if defined(MD_CELLINDEX) && 0
-  nati++;
-  natj++;
-#endif
-  if(nati*natj>VG_MINIMUM_ATYPE_BLOCK){
-    fprintf(stderr,"** error : nati*natj>VG_MINIMUM_ATYPE_BLOCK(%d) **\n",
-	    VG_MINIMUM_ATYPE_BLOCK);
-    vg_exit(1);
-  }
-  size=sizeof(VG_MATRIX)*MATRIX_ROUNDUP(nati*natj);
-  //  size=sizeof(VG_MATRIX)*MATRIX_ROUNDUP(nati*natj)*2;printf("size is doubled in vg_set_matrices_gscales_rscales\n");
-
-  unit->nati=nati;
-  unit->natj=natj;
-  MR3_free_pointer(unit->matrices,"vg_set_matrices_gscales_rscales");
-  if((unit->matrices=(void *)MR3_malloc_pointer(size,"vg_set_matrices_gscales_rscales"))==NULL){
-    fprintf(stderr,"** error : can't malloc matrices in vg_set_vectors **\n");
-    vg_exit(1);
-  }
-#if defined(MD_CELLINDEX) && 0
-#if 0 // atype=nati_org or atype=natj_org is dummy
-  for(i=0;i<nati;i++){
-    for(j=0;j<natj;j++){
-      mat=(VG_MATRIX *)(unit->matrices)+i*natj+j;
-      if(i==nati_org || j==natj_org){
-	mat->gscale=0.0f;
-	mat->rscale=1.0f;
-#if VDW_SHIFT>=1
-	mat->shift=1.0f;
-#endif
-      }
-      else{
-	mat->gscale=(float)(gscales[i*natj_org+j]);
-#if MD_USE_LARGE_VAL==2 || MD_USE_LARGE_VAL==20
-	mat->gscale*=(float)(LOWER_VAL_FACTOR_ORG); // Narumi's method v2
-#endif
-	mat->rscale=(float)(rscales[i*natj_org+j]);
-#if VDW_SHIFT==1
-	mat->shift=(float)(pow(rscales[i*natj_org+j]*rcut2,-3.0));
-#elif VDW_SHIFT==2
-	mat->shift=(float)((unit->cuton)*(unit->cuton));
-#endif
-      }
-    }
-  }
-#else // atype=0 is dummy
-  for(i=0;i<nati;i++){
-    for(j=0;j<natj;j++){
-      mat=(VG_MATRIX *)(unit->matrices)+i*natj+j;
-      if(i==0 || j==0){
-	mat->gscale=0.0f;
-	mat->rscale=1.0f;
-#if VDW_SHIFT>=1
-	mat->shift=1.0f;
-#endif
-      }
-      else{
-	mat->gscale=(float)(gscales[(i-1)*natj_org+j-1]);
-#if MD_USE_LARGE_VAL==2 || MD_USE_LARGE_VAL==20
-	mat->gscale*=(float)(LOWER_VAL_FACTOR_ORG); // Narumi's method v2
-#endif
-	mat->rscale=(float)(rscales[(i-1)*natj_org+j-1]);
-#if VDW_SHIFT==1
-	mat->shift=(float)(pow(rscales[(i-1)*natj_org+j-1]*rcut2,-3.0));
-#elif VDW_SHIFT==2
-	mat->shift=(float)((unit->cuton)*(unit->cuton));
-#endif
-      }
-    }
-  }
-#endif
-#else // else of MD_CELLINDEX
-  for(i=0;i<nati_org;i++){
-    for(j=0;j<natj_org;j++){
-      mat=(VG_MATRIX *)(unit->matrices)+i*natj+j;
-      mat->gscale=(float)(gscales[i*natj_org+j]);
-#if MD_USE_LARGE_VAL==2 || MD_USE_LARGE_VAL==20
-      mat->gscale*=(float)(LOWER_VAL_FACTOR_ORG); // Narumi's method v2
-#endif
-      mat->rscale=(float)(rscales[i*natj_org+j]);
-#if VDW_SHIFT==1
-      mat->shift=(float)(pow(rscales[i*natj_org+j]*rcut2,-3.0));
-#elif VDW_SHIFT==2
-      mat->shift=(float)((unit->cuton)*(unit->cuton));
-#endif
-    }
-  }
-#endif // end of MD_CELLINDEX
-}
-
-
-void vg_set_pol_sigm_ipotro_pc_pd_zz(VG_UNIT *unit, int nati, int natj,
-				     double pol[], double sigm[], double ipotro[],
-				     double pc[], double pd[], double zz[])
-{
-#ifdef MD_NACL
-  int i,j,size;
-  VG_MATRIX *mat;
-
-  if(nati*natj>VG_MINIMUM_ATYPE_BLOCK){
-    fprintf(stderr,"** error : nati*natj>VG_MINIMUM_ATYPE_BLOCK(%d) **\n",
-	    VG_MINIMUM_ATYPE_BLOCK);
-    vg_exit(1);
-  }
-  size=sizeof(VG_MATRIX)*MATRIX_ROUNDUP(nati*natj);
-  unit->nati=nati;
-  unit->natj=natj;
-  MR3_free_pointer(unit->matrices,"vg_set_pol_sigm_ipotro_pc_pd_zz");
-  if((unit->matrices=(void *)MR3_malloc_pointer(size,"vg_set_pol_sigm_ipotro_pc_pd_zz"))==NULL){
-    fprintf(stderr,"** error : can't malloc matrices in vg_set_vectors **\n");
-    vg_exit(1);
-  }
-  for(i=0;i<nati;i++){
-    for(j=0;j<natj;j++){
-      mat=(VG_MATRIX *)(unit->matrices)+i*natj+j;
-      //      mat->gscale=(float)(gscales[i*natj+j]);
-      //      mat->rscale=(float)(rscales[i*natj+j]);
-      mat->pol=(float)(pol[i*natj+j]);
-      mat->sigm=(float)(sigm[i*natj+j]);
-      mat->ipotro=(float)(ipotro[i*natj+j]);
-      mat->pc=(float)(pc[i*natj+j]);
-      mat->pd=(float)(pd[i*natj+j]);
-      mat->zz=(float)(zz[i*natj+j]);
-    }
-  }
-#else
-  fprintf(stderr,"**  error : MD_NACL should be defined in md.h **\n");
-  vg_exit(1);
-#endif
-}
-
-
-void vg_set_scalers_volume_alpha(VG_UNIT *unit, double volume[3], double alpha)
-{
-  int i;
-  VG_SCALER *scal;
-
-  if(unit->scalers==NULL){
-    unit->ssize=sizeof(VG_SCALER);
-    //    unit->ssize=sizeof(VG_SCALER)*2;printf("size is doubled in vg_set_scalers_volume_alpha\n");
-
-    //    MR3_free_pointer(unit->scalers,"vg_set_scalers_volume_alpha");
-    if((unit->scalers=(void *)MR3_malloc_pointer(unit->ssize,"vg_set_scalers_volume_alpha"))==NULL){
-      fprintf(stderr,"** error : can't malloc scalers in vg_set_vectors **\n");
-      vg_exit(1);
-    }
-  }
-  scal=(VG_SCALER *)(unit->scalers);
-  for(i=0;i<3;i++) scal->volume[i]=(float)(volume[i]);
-  scal->alpha=(float)alpha;
-  scal->alphafac=alpha*Coulomb_vdw_factor;
-  scal->alpha3fac=alpha*alpha*alpha*Coulomb_vdw_factor;
-#if defined(COULOMB_SHIFT) || 0
-  scal->rcut21=(float)(1.0/unit->rcut2);
-#endif
-#if 1
-  {
-    static int ini=0;
-    if(ini==0){
-#ifdef MD_PRINT_WARN
-      printf("eps2 is set zero in vg_set_scalers_volume_alpha in vtgrape.c **\n");
-#endif
-      ini=1;
-    }
-  }
-  //  scal->eps2=0.0f;
-#endif
-}
-
-
-#ifdef MD_MATRIX_IN_SCALER
-void vg_set_scalers_volume_alpha_gscales_rscales(VG_UNIT *unit, double volume[3], 
-						 double alpha, int nat,
-						 double gscalesf[],
-						 double gscalesp[], double rscales[])
-{
-  int i;
-  VG_SCALER *scal;
-
-  if(unit->scalers==NULL){
-    unit->ssize=sizeof(VG_SCALER);
-    //    MR3_free_pointer(unit->scalers,"vg_set_scalers_volume_alpha_gscales_rscales");
-    if((unit->scalers=(void *)MR3_malloc_pointer(unit->ssize,"unit->scalers in vg_set_scalers_volume_alpha_gscales_rscales"))==NULL){
-      fprintf(stderr,"** error : can't malloc scalers in vg_set_vectors **\n");
-      vg_exit(1);
-    }
-  }
-  scal=(VG_SCALER *)(unit->scalers);
-  for(i=0;i<3;i++) scal->volume[i]=(float)(volume[i]);
-  scal->alpha=(float)alpha;
-  for(i=0;i<nat*nat;i++){
-    scal->gscalesf[i]=(float)(gscalesf[i]);
-    scal->gscalesp[i]=(float)(gscalesp[i]);
-    scal->rscales[i]=(float)(rscales[i]);
-  }
-#if 1
-  {
-    static int ini=0;
-    if(ini==0){
-#ifdef MD_PRINT_WARN
-      printf("eps2 is set zero in vg_set_scalers_volume_alpha in vtgrape.c **\n");
-#endif
-      ini=1;
-    }
-  }
-  //  scal->eps2=0.0f;
-#endif
-}
-#endif
-
-
-void vg_set_pipeline_vectors_pos_charge_atype(VG_UNIT *unit, int ni, 
-					      double pos[][3], double charge[],
-					      int atype[])
-{
-#if defined(MD_USE_QAUNION) && defined(MD_QAUNION_ATYPEBIT)
-  int i,j,size;
-  VG_IVEC *ivec;
-  double scaleq;
-  float scaleq_1;
-  COPY_POS_DEFINE;
-
-  size=sizeof(VG_IVEC)*NI_ROUNDUP(ni);
-
-  unit->ni=ni;
-  //  printf("unit->ni=%d ni=%d in set_pipeline_vectors_pos_charge_atype\n",unit->ni,ni);
-  if(unit->ivectors==NULL){
-    if(size<unit->isize) size=unit->isize;
-    unit->isize=size;
-#ifdef MD_PRINT_WARN
-    printf("allocating %d ivectors\n",size);
-#endif
-    if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors_pos_charge_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc ivectors in vg_set_pipeline_vectors_pos_charge_atype **\n");
-      vg_exit(1);
-    }
-  }
-  else if(size>unit->isize){
-#ifdef MD_PRINT_WARN
-    printf("reallocating %d ivectors\n",size);
-#endif
-    MR3_free_pointer(unit->ivectors,"vg_set_pipeline_vectors_pos_charge_atype");
-    if((unit->ivectors=(void *)MR3_malloc_pointer(size,"void vg_set_pipeline_vectors_pos_charge_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc ivectors in vg_set_pipeline_vectors_pos_charge_atype **\n");
-      vg_exit(1);
-    }
-    unit->isize=size;
-  }
-  vg_set_scaleq(ni,charge,&scaleq,&scaleq_1);
-  if(unit->scalers!=NULL){
-    ((VG_SCALER *)(unit->scalers))->scaleqi_1=scaleq_1;
-#if MD_USE_LARGE_VAL==2 || MD_USE_LARGE_VAL==20
-    //    ((VG_SCALER *)(unit->scalers))->scaleqi_1*=LOWER_VAL_FACTOR_ORG;
-#endif
-  }
-  else{
-    fprintf(stderr,"** error : unit->scalers=NULL **\n");
-    vg_exit(1);
-  }
-  for(i=0;i<ni;i++){
-    ivec=(VG_IVEC *)(unit->ivectors)+i;
-    for(j=0;j<3;j++){
-      COPY_POS(ivec->r[j],pos[i][j],volume_1[j]);
-    }
-    ivec->qatype.atype=Q_DOUBLE_TO_INT(charge[i],scaleq);
-    ivec->qatype.atype|=atype[i] & MASK(MD_QAUNION_ATYPEBIT);
-  }
-#elif defined(MD_NACL)
-  int i,j,size;
-  VG_IVEC *ivec;
-  COPY_POS_DEFINE;
-
-  size=sizeof(VG_IVEC)*NI_ROUNDUP(ni);
-  unit->ni=ni;
-
-  if(unit->ivectors==NULL){
-    if(size<unit->isize) size=unit->isize;
-    unit->isize=size;
-#ifdef MD_PRINT_WARN
-    printf("allocating %d ivectors\n",size);
-#endif
-    if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors_pos_charge_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc ivectors in vg_set_vectors_pos_charge_atype **\n");
-      vg_exit(1);
-    }
-  }
-  else if(size>unit->isize){
-#ifdef MD_PRINT_WARN
-    printf("reallocating %d ivectors\n",size);
-#endif
-    MR3_free_pointer(unit->ivectors,"vg_set_pipeline_vectors_pos_charge");
-    if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors_pos_charge_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc ivectors in vg_set_vectors_pos_charge_atype **\n");
-      vg_exit(1);
-    }
-  }
-  for(i=0;i<ni;i++){
-    ivec=(VG_IVEC *)(unit->ivectors)+i;
-    for(j=0;j<3;j++){
-      COPY_POS(ivec->r[j],pos[i][j],volume_1[j]);
-    }
-    ivec->q=(float)(charge[i]);
-    ivec->atype=atype[i];
-  }
-#else
-  fprintf(stderr,"** error : MD_QAUNION_ATYPEBIT must be defined for vg_set_pipeline_vectors_pos_charge_atype **\n");
-  vg_exit(1);
-#endif
-}
-
-
-#ifdef MD_SIGMA_EPSION_IN_VGSTRUCT
-void vg_set_pipeline_vectors_pos_halfsigma_sqrtepsilon(VG_UNIT *unit, int ni, 
-						       double pos[][3], 
-						       double halfsigma[],
-						       double sqrtepsilon[])
-{
-  int i,j,size;
-  VG_IVEC *ivec;
-  COPY_POS_DEFINE;
-
-  printf("** warning: free and malloc of ivectors should be modified **\n");
-  size=sizeof(VG_IVEC)*NI_ROUNDUP(ni);
-  unit->ni=ni;
-  MR3_free_pointer(unit->ivectors,"vg_set_pipeline_vectors_pos_halfsigma_sqrtepsilon");
-  if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors_pos_halfsigma_sqrtepsilon"))==NULL){
-    fprintf(stderr,"** error : can't malloc ivectors in vg_set_vectors_pos_charge_atype **\n");
-    vg_exit(1);
-  }
-  for(i=0;i<ni;i++){
-    ivec=(VG_IVEC *)(unit->ivectors)+i;
-    for(j=0;j<3;j++){
-      COPY_POS(ivec->r[j],pos[i][j],volume_1[j]);
-    }
-    ivec->halfsigma=(float)(halfsigma[i]);
-    ivec->sqrtepsilon=(float)(sqrtepsilon[i]);
-  }
-}
-#endif
-
-
-#ifdef MD_USE_QAUNION
-void vg_set_pipeline_vectors_pos_charge(VG_UNIT *unit, int ni, 
-					double pos[][3], double charge[])
-{
-  int i,j,size;
-  VG_IVEC *ivec;
-#ifdef MD_QAUNION_ATYPEBIT
-  double scaleq;
-  float scaleq_1;
-#endif
-  COPY_POS_DEFINE;
-
-  size=sizeof(VG_IVEC)*NI_ROUNDUP(ni);
-  //  size=sizeof(VG_IVEC)*NI_ROUNDUP(ni)*2;printf("size is doubled in vg_set_pipeline_vectors_pos_charge\n");
-
-  unit->ni=ni;
-#if 0
-  MR3_free_pointer(unit->ivectors,"vg_set_pipeline_vectors_pos_charge");
-  if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors_pos_charge"))==NULL){
-    fprintf(stderr,"** error : can't malloc ivectors in vg_set_vectors_pos_charge **\n");
-    vg_exit(1);
-  }
-#else
-  if(unit->ivectors==NULL){
-    if(size<unit->isize) size=unit->isize;
-    unit->isize=size;
-#ifdef MD_PRINT_WARN
-    printf("allocating %d ivectors\n",size);
-#endif
-    if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors_pos_charge"))==NULL){
-      fprintf(stderr,"** error : can't malloc ivectors in vg_set_pipeline_vectors_pos_charge **\n");
-      vg_exit(1);
-    }
-  }
-  else if(size>unit->isize){
-#ifdef MD_PRINT_WARN
-    printf("reallocating %d ivectors\n",size);
-#endif
-    MR3_free_pointer(unit->ivectors,"vg_set_pipeline_vectors_pos_charge");
-    if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors_pos_charge"))==NULL){
-      fprintf(stderr,"** error : can't malloc ivectors in vg_set_pipeline_vectors_pos_charge **\n");
-      vg_exit(1);
-    }
-    unit->isize=size;
-  }
-#endif
-#ifdef MD_QAUNION_ATYPEBIT
-  vg_set_scaleq(ni,charge,&scaleq,&scaleq_1);
-  if(unit->scalers!=NULL){
-    ((VG_SCALER *)(unit->scalers))->scaleqi_1=scaleq_1;
-  }
-  else{
-    fprintf(stderr,"** error : unit->scalers=NULL **\n");
-    vg_exit(1);
-  }
-#endif
-  for(i=0;i<ni;i++){
-    ivec=(VG_IVEC *)(unit->ivectors)+i;
-    for(j=0;j<3;j++){
-      COPY_POS(ivec->r[j],pos[i][j],volume_1[j]);
-    }
-#ifdef MD_QAUNION_ATYPEBIT
-    ivec->qatype.atype=Q_DOUBLE_TO_INT(charge[i],scaleq) | \
-      (MASK(MD_QAUNION_ATYPEBIT) & ivec->qatype.atype);
-#else
-    ivec->qatype.q=(float)(charge[i]);
-#endif
-  }
-}
-
-
-void vg_set_pipeline_vectors_pos_atype(VG_UNIT *unit, int ni, 
-				       double pos[][3], int atype[])
-{
-  int i,j,size;
-  VG_IVEC *ivec;
-  COPY_POS_DEFINE;
-
-  size=sizeof(VG_IVEC)*NI_ROUNDUP(ni);
-  //  size=sizeof(VG_IVEC)*NI_ROUNDUP(ni);printf("size is doubled in vg_set_pipeline_vectors_pos_atype\n");
-
-  unit->ni=ni;
-#if 0
-  MR3_free_pointer(unit->ivectors,"vg_set_pipeline_vectors_pos_atype");
-  if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors_pos_atype"))==NULL){
-    fprintf(stderr,"** error : can't malloc ivectors in vg_set_vectors_pos_atype **\n");
-    vg_exit(1);
-  }
-#else
-  if(unit->ivectors==NULL){
-    if(size<unit->isize) size=unit->isize;
-    unit->isize=size;
-#ifdef MD_PRINT_WARN
-    printf("allocating %d ivectors\n",size);
-#endif
-    if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors_pos_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc ivectors in vg_set_pipeline_vectors_pos_atype **\n");
-      vg_exit(1);
-    }
-  }
-  else if(size>unit->isize){
-#ifdef MD_PRINT_WARN
-    printf("reallocating %d ivectors\n",size);
-#endif
-    MR3_free_pointer(unit->ivectors,"vg_set_pipeline_vectors_pos_atype");
-    if((unit->ivectors=(void *)MR3_malloc_pointer(size,"vg_set_pipeline_vectors_pos_atype"))==NULL){
-      fprintf(stderr,"** error : can't malloc ivectors in vg_set_pipeline_vectors_pos_atype **\n");
-      vg_exit(1);
-    }
-    unit->isize=size;
-  }
-#endif
-  for(i=0;i<ni;i++){
-    ivec=(VG_IVEC *)(unit->ivectors)+i;
-    for(j=0;j<3;j++){
-      COPY_POS(ivec->r[j],pos[i][j],volume_1[j]);
-    }
-#ifdef MD_QAUNION_ATYPEBIT
-    ivec->qatype.atype=atype[i] | \
-      (~(MASK(MD_QAUNION_ATYPEBIT)) & ivec->qatype.atype);
-#else
-    ivec->qatype.atype=atype[i];
-#endif
-  }
-}
-#endif
-
-
-void vg_convert_forces(VG_UNIT *unit, int nf, int function_index, double fi[][3])
-{
-  int i,j;
-  VG_FVEC *fvec;
-
-  for(i=0;i<nf;i++){
-    fvec=(VG_FVEC *)(unit->fvectors)+i;
-#ifdef ACCUMULATE_PAIR_FLOAT
-#if ACCUMULATE_PAIR_FLOAT==3
-    for(j=0;j<3;j++) fi[i][j]=((double *)(fvec->fi))[j];
-#else // else of ACCUMULATE_PAIR_FLOAT==3
-#if MD_USE_LARGE_VAL==1
-    for(j=0;j<3;j++) fi[i][j]=(double)(fvec->fi[j])-(double)LARGE_VAL+(double)(fvec->fi2[j]);
-#elif MD_USE_LARGE_VAL==2
-    for(j=0;j<3;j++){
-      fi[i][j]=(double)(fvec->fi[j])-(double)LARGE_VAL
-	+((int *)(fvec->fi2))[j]*((double)LOWER_VAL_FACTOR_1);
-      fi[i][j]*=(double)LOWER_VAL_FACTOR_1_ORG;// Narumi's method v2
-    }
-#elif MD_USE_LARGE_VAL==20
-    if(function_index==2 || function_index==3
-#ifdef MD_CELLINDEX
-       || function_index==12 || function_index==13
-#endif
-#ifdef MD_QAUNION_ATYPEBIT
-       || function_index==36 || function_index==37 
-       || function_index==46 || function_index==47
-#endif
-       ){
-      for(j=0;j<3;j++){
-	fi[i][j]=(double)(fvec->fi[j])-(double)LARGE_VAL
-	  +((int *)(fvec->fi2))[j]*((double)LOWER_VAL_FACTOR_1);
-	fi[i][j]*=(double)LOWER_VAL_FACTOR_1_ORG;// Narumi's method v2
-      }
-    }
-    else{
-      for(j=0;j<3;j++) fi[i][j]=(double)(fvec->fi[j])+(double)(fvec->fi2[j]);
-    }
-#else
-    for(j=0;j<3;j++) fi[i][j]=(double)(fvec->fi[j])+(double)(fvec->fi2[j]);
-    //    if(i<3) printf("i=%d fi=%20.12e %20.12e %20.12e fi2=%20.12e %20.12e %20.12e\n",i,fvec->fi[0],fvec->fi[1],fvec->fi[2],fvec->fi2[0],fvec->fi2[1],fvec->fi2[2]);
-#endif
-#endif // end of ACCUMULATE_PAIR_FLOAT==3
-#else
-    for(j=0;j<3;j++) fi[i][j]=(double)(fvec->fi[j]);
-#endif
-  }
-}
-
-
-void vg_calculate_forces_potentials(VG_UNIT *unit, int function_index,
-				    int nf, double fi[][3],
-				    double *potc, double *potv)
-{
-  int fsize,psize;
-  int i,j;
-  VG_FVEC *fvec;
-  VG_PSCALER *pscal;
-
-  //  printf("in vg_calculate_forces_potentials : tblno=%d\n",function_index);
-  fsize=sizeof(VG_FVEC)*NF_ROUNDUP(nf);
-  //  fsize=sizeof(VG_FVEC)*NF_ROUNDUP(nf)*2;printf("size is doubled in vg_calculate_forces_potentials\n");
-  unit->nf=nf;
-#if 0
-  //  printf("unit->fvectors=%x before free\n",unit->fvectors);
-  MR3_free_pointer(unit->fvectors,"vg_calculate_forces_potentials");
-  if((unit->fvectors=(void *)MR3_malloc_pointer(fsize,"vg_calculate_forces_potentials"))==NULL){
-    fprintf(stderr,"** error : can't malloc fvectors in vg_calculate **\n");
-    vg_exit(1);
-  }
-  //  printf("unit->fvectors=%x after malloc\n",unit->fvectors);
-#else
-  //  printf("unit->fvectors=%x in vg_calculate_potentials\n",unit->fvectors);
-  if(unit->fvectors==NULL){
-    //    printf("fsize=%d unit->fsize=%d nf=%d(*VG_FVEC=%d)\n",fsize,unit->fsize,nf,nf*sizeof(VG_FVEC));
-    if(fsize<unit->fsize) fsize=unit->fsize;
-    unit->fsize=fsize;
-#ifdef MD_PRINT_WARN
-    printf("allocating %d fvectors\n",fsize);
-#endif
-    if((unit->fvectors=(void *)MR3_malloc_pointer(fsize,"vg_calculate_forces_potentials"))==NULL){
-      fprintf(stderr,"** error : can't malloc fvectors in vg_calculate_forces_potentials **\n");
-      vg_exit(1);
-    }
-  }
-  else if(fsize>unit->fsize){
-#ifdef MD_PRINT_WARN
-    printf("reallocating %d fvectors\n",fsize);
-#endif
-    MR3_free_pointer(unit->fvectors,"vg_calculate_forces_potentials");
-    if((unit->fvectors=(void *)MR3_malloc_pointer(fsize,"vg_calculate_forces_potentials"))==NULL){
-      fprintf(stderr,"** error : can't malloc fvectors in vg_calculate_forces_potentials **\n");
-      vg_exit(1);
-    }
-    unit->fsize=fsize;
-  }
-#endif
-  bzero(unit->fvectors,fsize);
-  psize=sizeof(VG_PSCALER);
-  //  psize=sizeof(VG_PSCALER)*2;printf("size is doubled in vg_calculate_forces_potentials\n");
-
-#ifdef MD_CELLINDEX
-  if(unit->pscalers==NULL){
-    if((unit->pscalers=(VG_PSCALER *)MR3_malloc_pointer(sizeof(VG_PSCALER),"vg_calculate_forces_potentials"))==NULL){
-      fprintf(stderr,"** error : can't malloc unit->pscalers in vg_calculate **\n");
-      vg_exit(1);
-    }
-  }
-#else
-  MR3_free_pointer(unit->pscalers,"vg_calculate_forces_potentials");
-  if((unit->pscalers=(void *)MR3_malloc_pointer(psize,"vg_calculate_forces_potentials"))==NULL){
-    fprintf(stderr,"** error : can't malloc pscalers in vg_calculate **\n");
-    vg_exit(1);
-  }
-  bzero(unit->pscalers,psize);
-#endif
-  //  if(unit->debug_flag==0) 
-  //  printf("in vg_calculate_forces_potentials: tblno is added by 10\n",function_index+=10;
-  //  printf("in vg_calculate_forces_potentials: function_index=%d\n",function_index); 
-  unit->calculate[function_index]((void *)unit);
-  //  else debug_mother((void *)unit);
-
-#if defined(MD_PRINT_WARN) && 0
-  printf("  unit->gpuoverlapflag=%d in vg_calculate_forces_potentials\n",unit->gpuoverlapflag);
-#endif
-  if(unit->gpuoverlapflag==0){
-    vg_convert_forces(unit,nf,function_index,fi);
-  }
-  else{
-    unit->function_index=function_index;
-    unit->fthread=(double *)fi;
-  }
-  pscal=(VG_PSCALER *)(unit->pscalers);
-  *potc=(double)(pscal->potc);
-  *potv=(double)(pscal->potv);
-}
-
-
-static void vg_get_funcresult(int n_org, float r[], float r1[], int function_index)
-{
-#ifdef MD_USE_QAUNION
-  int i,j,ni,nj,n,l,nii;
-  VG_UNION_FI fi;
-  double vol[3]={256.0,256.0,256.0},rscale=1.0;
-  double potc,potv;
-  double *xi;
-  double *xj;
-  double *force;
-  double *qi;
-  double *qj;
-  VG_UNIT *unit=MR3_unit;
-
-  if(unit==NULL){
-    fprintf(stderr,"** error : unit is null **\n");
-    exit(1);
-  }
-  if(n_org<0) n=-n_org;
-  else        n=n_org;
-  
-  if((xi=(double *)MR3_malloc_pointer(sizeof(double)*n*3,"void vg_get_funcresult"))==NULL){
-    fprintf(stderr,"** error : can't malloc xi **\n");
-    exit(1);
-  }
-  if((xj=(double *)MR3_malloc_pointer(sizeof(double)*n*3,"void vg_get_funcresult"))==NULL){
-    fprintf(stderr,"** error : can't malloc xj **\n");
-    exit(1);
-  }
-  if((force=(double *)MR3_malloc_pointer(sizeof(double)*n*3,"void vg_get_funcresult"))==NULL){
-    fprintf(stderr,"** error : can't malloc force **\n");
-    exit(1);
-  }
-  if((qi=(double *)MR3_malloc_pointer(sizeof(double)*n,"void vg_get_funcresult"))==NULL){
-    fprintf(stderr,"** error : can't malloc qi **\n");
-    exit(1);
-  }
-  if((qj=(double *)MR3_malloc_pointer(sizeof(double)*n,"void vg_get_funcresult"))==NULL){
-    fprintf(stderr,"** error : can't malloc qj **\n");
-    exit(1);
-  }
-
-  ni=n;
-#if 1
-  nj=1;
-#else
-  nj=VG_MINIMUM_PARTICLE_BLOCK_I;
-  printf("**** nj is set %d\n",nj);
-#endif
-
-  if(n_org>0){
-    for(i=0;i<n;i++){
-      if(i<(1<<23)){
-	fi.f=1.0f;
-	fi.i=(fi.i & 0xff800000) | i;
-	//    printf("i=%d fi.i=%08x\n",i,fi.i);
-      }
-      else{
-	fi.f=2.0f;
-	fi.i=(fi.i & 0xff800000) | (i-(1<<23));
-      }
-      r[i]=fi.f;
-    }
-  }
-
-  for(i=0;i<ni;i++){
-    xi[i*3+0]=r[i];
-    xi[i*3+1]=xi[i*3+2]=0.0;
-    qi[i]=1.0;
-    force[i*3]=force[i*3+1]=force[i*3+2]=0.0;
-  }
-  for(i=0;i<VG_MINIMUM_PARTICLE_BLOCK_I;i++){
-    xj[i*3]=xj[i*3+1]=xj[i*3+2]=0.0;
-    qj[i]=0.0;
-  }
-  qj[0]=1.0;
-
-  for(l=0;l<(ni+(1<<23)-1)/(1<<23);l++){
-    int ii,niii;
-    nii=l*(1<<23)+(1<<23)>ni ? ni-l*(1<<23):(1<<23);
-    vg_set_vectors_pos_charge(unit,nj,(double (*)[3])xj,qj);
-    vg_set_scalers_volume_alpha(unit,vol,rscale);
-    for(ii=0;ii<nii;ii+=MD_MAX_I_PER_KERNEL){
-      niii=ii+MD_MAX_I_PER_KERNEL>nii ? nii-ii:MD_MAX_I_PER_KERNEL;
-      vg_set_pipeline_vectors_pos_charge(unit,niii,((double (*)[3])xi)+l*(1<<23)+ii,qi);
-      vg_calculate_forces_potentials(unit,function_index,niii,(double (*)[3])force+ii,&potc,&potv);
-    }
-    for(i=0;i<nii;i++){
-      r1[i+l*(1<<23)]=force[i*3];
-    }
-  }
-  /*
-  for(l=0;l<(ni+(1<<23)-1)/(1<<23);l++){
-    nii=l*(1<<23)+(1<<23)>ni ? ni-l*(1<<23):(1<<23);
-    vg_set_vectors_pos_charge(unit,nj,(double (*)[3])xj,qj);
-    vg_set_scalers_volume_alpha(unit,vol,rscale);
-    vg_set_pipeline_vectors_pos_charge(unit,nii,((double (*)[3])xi)+l*(1<<23),qi);
-    vg_calculate_forces_potentials(unit,function_index,nii,(double (*)[3])force,&potc,&potv);
-    for(i=0;i<nii;i++){
-      r1[i+l*(1<<23)]=force[i*3];
-    }
-    }*/
-  //  for(i=0;i<3;i++) printf("i=%d r1=%e\n",i,r1[i]);
-  MR3_free_pointer(xi,"void vg_get_funcresult");
-  MR3_free_pointer(xj,"void vg_get_funcresult");
-  MR3_free_pointer(qi,"void vg_get_funcresult");
-  MR3_free_pointer(qj,"void vg_get_funcresult");
-  MR3_free_pointer(force,"void vg_get_funcresult");
-#else
-  fprintf(stderr,"** MD_USE_QAUNION must be defined in vg_get_r1result **\n");
-#endif
-}
-
-
-void vg_get_r1result(int n_org, float r[], float r1[])
-{
-#if 1
-  vg_get_funcresult(n_org,r,r1,20);
-#else
-  vg_get_funcresult(n_org,r,r1,0);
-  printf("************** function_index is set 0\n");
-#endif
-}
-
-
-void vg_get_rsqrtresult(int n_org, float r[], float r1[])
-{
-  vg_get_funcresult(n_org,r,r1,21);
-  //  printf("************** function_index is set 20\n");
-}
-
-
-void vg_get_mulresult(int n_org, float r[], float q[], float rq[])
-{
-#ifdef MD_USE_QAUNION
-  int function_index=22;
-  int i,j,ni,nj,n,l,nii;
-  VG_UNION_FI fi;
-  double vol[3]={256.0,256.0,256.0},rscale=1.0;
-  double potc,potv;
-  double *xi;
-  double *xj;
-  double *force;
-  double *qi;
-  double *qj;
-  VG_UNIT *unit=MR3_unit;
-
-  if(n_org<0) n=-n_org;
-  else        n=n_org;
-  
-  if((xi=(double *)MR3_malloc_pointer(sizeof(double)*n*3,"vg_get_mulresult"))==NULL){
-    fprintf(stderr,"** error : can't malloc xi **\n");
-    exit(1);
-  }
-  if((xj=(double *)MR3_malloc_pointer(sizeof(double)*n*3,"vg_get_mulresult"))==NULL){
-    fprintf(stderr,"** error : can't malloc xj **\n");
-    exit(1);
-  }
-  if((force=(double *)MR3_malloc_pointer(sizeof(double)*n*3,"vg_get_mulresult"))==NULL){
-    fprintf(stderr,"** error : can't malloc force **\n");
-    exit(1);
-  }
-  if((qi=(double *)MR3_malloc_pointer(sizeof(double)*n,"vg_get_mulresult"))==NULL){
-    fprintf(stderr,"** error : can't malloc qi **\n");
-    exit(1);
-  }
-  if((qj=(double *)MR3_malloc_pointer(sizeof(double)*n,"vg_get_mulresult"))==NULL){
-    fprintf(stderr,"** error : can't malloc qj **\n");
-    exit(1);
-  }
-
-  ni=n;
-#if 1
-  nj=1;
-#else