1. Rio Yokota
  2. PetIGA-FMM

Commits

Huda Ibeid  committed 4342d6b

nbody kernels

  • Participants
  • Parent commits 2bf9469
  • Branches default

Comments (0)

Files changed (6)

File fmm/Poisson3D.c

View file
  • Ignore whitespace
 
   PetscInt dim = 3;
 
-  PetscInt N[3] = {16,16,16}, nN = 3; 
+  PetscInt N[3] = {5,5,5}, nN = 3; 
   PetscInt p[3] = { 1, 1, 1}, np = 3;
   PetscInt C[3] = {-1,-1,-1}, nC = 3;
   ierr = PetscOptionsBegin(PETSC_COMM_WORLD,"","Poisson Options","IGA");CHKERRQ(ierr);

File fmm/fmm2D.c

View file
  • Ignore whitespace
   ierr = MatSetUp(fmm->G);CHKERRQ(ierr);
 
   PetscReal eps = .2 / (1<<((int)(log2(info.mx-1))));
-//  FMM_Init(eps*eps, 0);
+  FMM_Init(eps*eps, 0);
   fmm->eps = eps;
   PetscFunctionReturn(0);
 }
   nb = ic; // this is ok, but we should improve memory allocation
   assert( nb == ic );
   
-  //FMM(nb,xm,ym,rhs,ni,xi,yi,ri);
-  nbodyG(nb,xm,ym,rhs,ni,xi,yi,ri,eps);
+  FMM(nb,xm,ym,rhs,ni,xi,yi,ri);
+  //nbodyG(nb,xm,ym,rhs,ni,xi,yi,ri,eps);
+
   //nbodyGn(nb,xm,ym,rhs,nb,xm,ym,um,dxe,dye,re,eps);
 
   for (i=0; i<nb; i++) {
   for (i=0; i<ni; i++) {
     ui[i] = 0;
   }
-  //FMM(ni,xi,yi,ui,ni,xi,yi,ri);
-  //FMM(ni,xi,yi,ui,nb,xm,ym,un);
-  nbodyG(ni,xi,yi,ui,ni,xi,yi,ri,eps);
-  nbodyG(ni,xi,yi,ui,nb,xm,ym,un,eps);
+  FMM(ni,xi,yi,ui,ni,xi,yi,ri);
+  FMM(ni,xi,yi,ui,nb,xm,ym,un);
+//  nbodyG(ni,xi,yi,ui,ni,xi,yi,ri,eps);
+//  nbodyG(ni,xi,yi,ui,nb,xm,ym,un,eps);
+
   //nbodyGn(ni,xi,yi,ui,nb,xm,ym,um,dxe,dye,re,eps);
 
   ic = 0;

File fmm/fmm3D.c

View file
  • Ignore whitespace
   PetscReal *xm = fmm->xm;
   PetscReal *ym = fmm->ym;
   PetscReal *zm = fmm->zm;
-//  PetscReal *re = fmm->re;
+  PetscReal *re = fmm->re;
   PetscReal eps = fmm->eps;
   VecGetSize(x,&i);
   ierr = VecGetArray(x,&xl);CHKERRQ(ierr);
   ierr = PetscMalloc(16*sizeof(PetscReal),&(fmm->vg));CHKERRQ(ierr);
   ierr = PetscMalloc(fmm->nb*sizeof(PetscReal),&(fmm->phi));CHKERRQ(ierr);
   ierr = PetscMalloc(fmm->nb*sizeof(PetscReal),&(fmm->dphi));CHKERRQ(ierr);
+  ierr = PetscMalloc(16*fmm->nb*sizeof(PetscReal),&(fmm->re));CHKERRQ(ierr);
+  ierr = PetscMalloc(fmm->nb*sizeof(PetscReal),&(fmm->D_1));CHKERRQ(ierr);
+  ierr = PetscMalloc(fmm->nb*sizeof(PetscReal),&(fmm->D_2));CHKERRQ(ierr);
+  ierr = PetscMalloc(fmm->ni*sizeof(PetscReal),&(fmm->D_3));CHKERRQ(ierr);
+  ierr = PetscMalloc(fmm->ni*sizeof(PetscReal),&(fmm->D_4));CHKERRQ(ierr);
 
   ierr = PetscMalloc3(fmm->ni,PetscReal,&(fmm->xi),
 		      fmm->ni,PetscReal,&(fmm->yi),
   ierr = PetscMalloc3(fmm->nb,PetscReal,&(fmm->rhs),
 		      fmm->nb,PetscReal,&(fmm->un),
 		      fmm->ni,PetscReal,&(fmm->ui));CHKERRQ(ierr);
+  ierr = PetscMalloc3(16*fmm->nb,PetscReal,&(fmm->xk),
+                      16*fmm->nb,PetscReal,&(fmm->yk),
+                      16*fmm->nb,PetscReal,&(fmm->zk));CHKERRQ(ierr);
   
   ierr = PetscMalloc3(fmm->nb,PetscReal,&(fmm->PCv00),
                       fmm->nb,PetscReal,&(fmm->PCv01),
 
 //TODO: check eps 
   PetscReal eps = 0.0000001;//.2 / (1<<((int)(log2(info.mx-1))));
-//  FMM_Init(eps*eps, 0);
+//  FMM_Init();
   fmm->eps = eps;
   PetscFunctionReturn(0);
 }
   PetscReal *xm  = fmm->xm;
   PetscReal *ym  = fmm->ym;
   PetscReal *zm  = fmm->zm;
+  PetscReal *xk  = fmm->xk;
+  PetscReal *yk  = fmm->yk;
+  PetscReal *zk  = fmm->zk;
   PetscReal *dxe = fmm->dxe;
   PetscReal *dye = fmm->dye;
   PetscReal *dze = fmm->dze;
   PetscReal *rhs = fmm->rhs;
   PetscReal *un  = fmm->un;
   PetscReal *ui  = fmm->ui;
+  PetscReal *re  = fmm->re;
+  PetscReal *D_1  = fmm->D_1;
+  PetscReal *D_2  = fmm->D_2;
+  PetscReal *D_3  = fmm->D_3;
+  PetscReal *D_4  = fmm->D_4;
   PetscReal  eps = fmm->eps;
 
   PetscInt i, j, k, ic = 0;
   PetscInt *BCT = fmm->BCT;
 
   PetscReal x21, x31, y21, y31, z21, z31;
-  PetscReal xk, yk, zk, D1, D2, del, ut;
+  PetscReal xk1, yk1, zk1, D1, D2, del, ut;
   PetscInt  N0 = (info.mx-1);
   PetscReal aa, bb, cc, dd, s1, s2, s3 = 1.0/sqrt(3.0), dl;
   PetscReal *jac = fmm->jac;
     ic += 1;
   }
 
+  j = 0;
+  for (k = 0; k < nb; k++){
+    for (i = 0; i < 16; i++){
+      ut     = tg[i]*(1.0-vg[i]);
+      xk[j]  = PCv00[k]*ut+PCv01[k]*vg[i]+PCv02[k];
+      yk[j]  = PCv10[k]*ut+PCv11[k]*vg[i]+PCv12[k];
+      zk[j]  = PCv20[k]*ut+PCv21[k]*vg[i]+PCv22[k];
+      re[j]  = (1.0-vg[i]);
+      j += 1;
+    }
+  }
+
   for(i = 0; i < nb; i++){
-    if (i < (2*N0*N0)){
+//    if (i < (2*N0*N0)){
       BCT[i] = 0;
       BCV[i] = nbodyG(xm[i],ym[i],zm[i],0.0,0.0,1.5,eps);
-    }
-    else{
-      BCT[i] = 1;
-      BCV[i] = nbodyGn(xm[i],ym[i],zm[i],0.0,0.0,1.5,dxe[i],dye[i],dze[i],eps);
-    }
+//    }
+//    else{
+//      BCT[i] = 1;
+//      BCV[i] = nbodyGn(xm[i],ym[i],zm[i],0.0,0.0,1.5,dxe[i],dye[i],dze[i],eps);
+//    }
   }
 
   nb = ic; // this is ok, but we should improve memory allocation
   assert( nb == ic );
 
-  for (m = 0; m < nb; m++){
-    rhs[m] = 0.0;
-    for (k = 0; k < nb; k++){
-      D1 = 0.0;
-      D2 = 0.0;
-      for (i = 0; i < 16; i++){
-        ut  = tg[i]*(1.0-vg[i]);
-        xk  = PCv00[k]*ut+PCv01[k]*vg[i]+PCv02[k];
-        yk  = PCv10[k]*ut+PCv11[k]*vg[i]+PCv12[k];
-        zk  = PCv20[k]*ut+PCv21[k]*vg[i]+PCv22[k];
-        D1 += nbodyG(xk,yk,zk,xm[m],ym[m],zm[m],eps)*(1.0-vg[i]);
-        D2 += nbodyGn(xk,yk,zk,xm[m],ym[m],zm[m],dxe[k],dye[k],dze[k],eps)*(1.0-vg[i]);
-      }
-      D1 = jac[k]*D1/16.0;
-      D2 = jac[k]*D2/16.0;
+  for(m=0;m<nb;m++){
+    rhs[m] = jac[m] * BCV[m];
+    D_1[m] = 0;
+    D_2[m] = 0;
+  }
 
-      if (k == m)
-        del = 1.0;
-      else
-        del = 0.0;
+  nbodyGL(nb, xk, yk, zk, D_1, nb, xm, ym, zm, re, eps, rhs);
+  nbodyGnL(nb, xk, yk, zk, D_2, nb, xm, ym, zm, re, dxe, dye, dze, eps, rhs);
 
-      if (BCT[k] == 0){
-        rhs[m] += BCV[k]*(-D2+0.5*del);
-        #if DENSE
-          PetscReal value = -D1;
-          MatSetValue(G,m,k,value,INSERT_VALUES);
-        #endif
-      }
-      else{
-        rhs[m] += BCV[k]*D1;
-        #if DENSE
-          PetscReal value = D2-0.5*del;
-          MatSetValue(G,m,k,value,INSERT_VALUES);
-        #endif
-      }
+  for (i=0; i<nb; i++) {
+    if (BCT[k] == 0)
+      rhs[k] = -D_2[k]+0.5*BCV[k];
+    else
+      rhs[k] = D_1[k];
+    VecSetValue(f,i,rhs[i],INSERT_VALUES);
+#if DENSE
+    for (j=0; j<=i; j++) {
+      PetscReal dx = xm[i] - xm[j];
+      PetscReal dy = ym[i] - ym[j];
+      PetscReal dz = zm[i] - zm[j];
+      PetscReal r = sqrt(dx * dx + dy * dy + dz * dz + eps * eps);
+      PetscReal value = -(0.25/r/M_PI);
+      MatSetValue(G,i,j,value,INSERT_VALUES);
+      MatSetValue(G,j,i,value,INSERT_VALUES);
     }
-    VecSetValue(f,m,rhs[m],INSERT_VALUES);
+#endif
   }
 
   ierr = VecAssemblyBegin(f);CHKERRQ(ierr);
   ierr = VecAssemblyEnd(f);CHKERRQ(ierr);
-  #if DENSE
-    ierr = MatAssemblyBegin(G,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
-    ierr = MatAssemblyEnd(G,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
-  #endif
-   
+#if DENSE
+  ierr = MatAssemblyBegin(G,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
+  ierr = MatAssemblyEnd(G,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
+#endif
+
   KSP ksp;
   ierr = KSPCreate(PETSC_COMM_WORLD,&ksp);CHKERRQ(ierr);
   ierr = KSPSetOperators(ksp,G,G,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
 
   for (m = 0; m < nb; m++){
     if (BCT[m]==0){
-      phi[m]  = BCV[m];
-      dphi[m] = un[m];
+      phi[m]  = BCV[m]*jac[m];
+      dphi[m] = un[m]*jac[m];
     }
     else{
-      phi[m]  = un[m];
-      dphi[m] = BCV[m];
+      phi[m]  = un[m]*jac[m];
+      dphi[m] = BCV[m]*jac[m];
     }
   }
+  
+  nbodyGL(ni, xk, yk, zk, D_3, nb, xi, yi, zi, re, eps, dphi);
+  nbodyGnL(ni, xk, yk, zk, D_4, nb, xi, yi, zi, re, dxe, dye, dze, eps, phi);
 
-  for (j = 0; j < ni; j++){
-    for (k = 0; k < nb; k++){
-      D1 = 0.0;
-      D2 = 0.0;
-      for (i = 0; i < 16; i++){
-        ut  = tg[i]*(1.0-vg[i]);
-        xk  = PCv00[k]*ut+PCv01[k]*vg[i]+PCv02[k];
-        yk  = PCv10[k]*ut+PCv11[k]*vg[i]+PCv12[k];
-        zk  = PCv20[k]*ut+PCv21[k]*vg[i]+PCv22[k];
-        D1 += nbodyG(xk,yk,zk,xi[j],yi[j],zi[j],eps)*(1.0-vg[i]);
-        D2 += nbodyGn(xk,yk,zk,xi[j],yi[j],zi[j],dxe[k],dye[k],dze[k],eps)*(1.0-vg[i]);
-      }
-      D1 = jac[k]*D1/16.0;
-      D2 = jac[k]*D2/16.0;
-      ui[j] += phi[k]*D2-dphi[k]*D1;
-    }
-  }
+  for (k = 0; k < ni; k++)
+    ui[k] = D_4[k] - D_3[k];
 
   ic = 0;
   for (i=info.xs; i<info.xs+info.xm; i++) { // Loop over interior points
       }
     }
   }
-  
+
   ierr = DMDAVecRestoreArray(da,x,&X);CHKERRQ(ierr);
   ierr = DMDAVecRestoreArray(da,y,&Y);CHKERRQ(ierr);
 

File fmm/fmm3D.h

View file
  • Ignore whitespace
   Vec u,f;
   Mat G;
   PetscInt ni,nb,*BCT;
-  PetscReal *xi,*yi,*zi,*ri,*ub,*xm,*ym,*zm,*um,*dxe,*dye,*dze,*re,*rhs,*un,*ui,*jac,eps;
+  PetscReal *D_1, *D_2, *D_3, *D_4, *xk, *yk, *zk, *xi,*yi,*zi,*ri,*ub,*xm,*ym,*zm,*um,*dxe,*dye,*dze,*re,*rhs,*un,*ui,*jac,eps;
   PetscReal *BCV, *xb0,*yb0,*zb0,*xb1,*yb1,*zb1,*xb2,*yb2,*zb2, *tg, *vg, *phi, *dphi;
   PetscReal *PCv00, *PCv01, *PCv02, *PCv10, *PCv11, *PCv12, *PCv20, *PCv21, *PCv22;
 } PC_FMM;

File fmm/makefile

View file
  • Ignore whitespace
 2d: Poisson2D.o fmm2D.o
 	${CLINKER} $^ ${PETIGA_LIB} -L../../exafmm2d/wrappers -lfmm -ltbb -lm
-	mpirun -np 1 ./a.out -ksp_monitor -ksp_norm_type PRECONDITIONED -ksp_max_it 100 -pc_type hypre 
+	mpirun -np 1 ./a.out -ksp_monitor -ksp_norm_type PRECONDITIONED -ksp_max_it 100 -pc_type fmm 
 
 3d: Poisson3D.o fmm3D.o
-	${CLINKER} $^ ${PETIGA_LIB} -L../../exafmm2d/wrappers -lfmm -ltbb -lm
-	mpirun -np 1 ./a.out -ksp_monitor -ksp_norm_type PRECONDITIONED -ksp_max_it 15 -pc_type hypre
+	${CLINKER} $^ ${PETIGA_LIB} -L../../exafmm-dev/wrappers -fopenmp -lfmm -ltbb -lm
+	mpirun -np 1 ./a.out -ksp_monitor -ksp_norm_type PRECONDITIONED -ksp_max_it 15 -pc_type fmm
 
 clean::
 	-@${RM} *.o *.out

File fmm/nbody3D.h

View file
  • Ignore whitespace
   PetscReal dz = zi - zj;
   PetscReal rdn = dx * dxe + dy * dye + dz * dze;
   PetscReal r2 = dx * dx + dy * dy + dz * dz + eps2;
-   
   return 0.25 * rdn / pow(r2,1.5) / M_PI;
 }
-/*void nbodyG(int ni, PetscReal *xi, PetscReal *yi, PetscReal *zi, PetscReal *vi,
-            int nj, PetscReal *xj, PetscReal *yj, PetscReal *zj, PetscReal *vj, PetscReal eps) {
-  PetscInt i, j;
+
+void nbodyGL(int ni, PetscReal *xi, PetscReal *yi, PetscReal *zi, PetscReal *vi,
+            int nj, PetscReal *xj, PetscReal *yj, PetscReal *zj, PetscReal *vj, PetscReal eps, PetscReal *jac) {
+  PetscInt i, j, k, ic;
   PetscReal eps2 = eps * eps;
   for (i=0; i<ni; i++) {
-    PetscReal v = 0;
+    ic = 0;
     for (j=0; j<nj; j++) {
-      PetscReal dx = xi[i] - xj[j];
-      PetscReal dy = yi[i] - yj[j];
-      PetscReal dz = zi[i] - zj[j];
-      PetscReal r = sqrt(dx * dx + dy * dy + dz * dz + eps2);
-      v -= vj[j] * (0.25/r/M_PI);
+      PetscReal v = 0;
+      for (k = 0; k < 16; k++){
+        PetscReal dx = xi[ic] - xj[i];
+        PetscReal dy = yi[ic] - yj[i];
+        PetscReal dz = zi[ic] - zj[i];
+        PetscReal r = sqrt(dx * dx + dy * dy + dz * dz + eps2);
+        v -= vj[ic] * (0.25/r/M_PI);
+        ic++;
+      }
+      vi[i] += v*jac[j]/16;
     }
-    vi[i] += v;
   }
 }
 
-void nbodyGn(int ni, PetscReal *xi, PetscReal *yi, PetscReal *zi, PetscReal *vi,
+void nbodyGnL(int ni, PetscReal *xi, PetscReal *yi, PetscReal *zi, PetscReal *vi,
              int nj, PetscReal *xj, PetscReal *yj, PetscReal *zj, PetscReal *vj,
-	     PetscReal *dxe, PetscReal *dye, PetscReal *dze, PetscReal *re, PetscReal eps) {
-  PetscInt i, j;
+	     PetscReal *dxe, PetscReal *dye, PetscReal *dze, PetscReal eps, PetscReal *jac) {
+  PetscInt i, j, k, ic;
   PetscReal eps2 = eps * eps;
   for (i=0; i<ni; i++) {
-    PetscReal v = 0;
+    ic = 0;
     for (j=0; j<nj; j++) {
-      PetscReal dx = xi[i] - xj[j];
-      PetscReal dy = yi[i] - yj[j];
-      PetscReal dz = zi[i] - zj[j];
-      PetscReal rdn = dx * dxe[j] + dy * dye[j] + dz * dze[j];
-      PetscReal r2 = dx * dx + dy * dy + dz * dz + eps2;
-      v += vj[j] * 0.25 * rdn / pow(re[j],1.5) / M_PI;
+      PetscReal v = 0;
+      for (k = 0; k < 16; k++){
+        PetscReal dx = xi[ic] - xj[i];
+        PetscReal dy = yi[ic] - yj[i];
+        PetscReal dz = zi[ic] - zj[i];
+        PetscReal rdn = dx * dxe[j] + dy * dye[j] + dz * dze[j];
+        PetscReal r2 = dx * dx + dy * dy + dz * dz + eps2;
+        v += vj[ic] * 0.25 * rdn / pow(r2,1.5) / M_PI;
+        ic++;
+      }
+      vi[i] += v*jac[j]/16;
     }
-    vi[i] += v;
   }
-}*/
+}