Commits

Matthew Turk  committed 09357ef

Now it seems to work, although I am getting funky artifacts when it is run on
the GPU. Running on the CPU gives bitwise-identical answers.

  • Participants
  • Parent commits 1413fef

Comments (0)

Files changed (2)

     __private int yi = get_global_id(1);
     __local int zi;
     for (zi = 0; zi < nz; zi++) {
-        my_val += read_imagef(input_grid, grid_reader, (int4)(0, zi, yi, xi));
+        my_val += read_imagef(input_grid, grid_reader, (int4)(xi, yi, zi, 0));
     }
-    write_imagef(output_plane, (int2)(yi, xi), my_val);
+    write_imagef(output_plane, (int2)(xi, yi), my_val.x);
 }
 OUTIMG = {}
 CPUIMG = {}
 
-gformat = cl.ImageFormat(cl.channel_order.R,
+gformat = cl.ImageFormat(cl.channel_order.LUMINANCE,
                          cl.channel_type.FLOAT)
 
 with time_func("Copy to GPU"):
     for g in pf.h.grids:
-        GPUARR[g.id] = cl.Image(context, cl.mem_flags.READ_ONLY,
-                                gformat, hostbuf = g["Density"])
+        GPUARR[g.id] = cl.Image(context, 
+            cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
+            gformat, hostbuf = g["Density"])
     cl.enqueue_barrier(command_queue)
 
 with time_func("Create output images on GPU"):
     for g in pf.h.grids:
-        CPUIMG[g.id] = na.zeros((g.ActiveDimensions[1], g.ActiveDimensions[0]), dtype='float32')
-        OUTIMG[g.id] = cl.Image(context, cl.mem_flags.READ_ONLY,
-                                gformat, hostbuf = CPUIMG[g.id])
+        CPUIMG[g.id] = na.zeros((g.ActiveDimensions[0], g.ActiveDimensions[1]),
+                                dtype='float32', order='F')
+        OUTIMG[g.id] = cl.Image(context,
+            cl.mem_flags.WRITE_ONLY | cl.mem_flags.COPY_HOST_PTR,
+            gformat, hostbuf = CPUIMG[g.id])
+        
     cl.enqueue_barrier(command_queue)
 
 with time_func("Grid sum into images"):
     for g in pf.h.grids:
         ad[0] = g.ActiveDimensions[2]
         ss(command_queue, 
-            (g.ActiveDimensions[0], g.ActiveDimensions[1]),
-            (g.ActiveDimensions[0], g.ActiveDimensions[1]),
+            (int(g.ActiveDimensions[0]), int(g.ActiveDimensions[1])),
+            None,
             GPUARR[g.id], OUTIMG[g.id], na.uint32(ad[0]))
     cl.enqueue_barrier(command_queue)
 
 with time_func("Copy images back"):
     for g in pf.h.grids:
         cl.enqueue_copy(command_queue, CPUIMG[g.id], OUTIMG[g.id],
-                (0,0), (g.ActiveDimensions[1], g.ActiveDimensions[0]))
+                origin=(0,0),
+                region=(int(g.ActiveDimensions[0]),
+                        int(g.ActiveDimensions[1])))
     cl.enqueue_barrier(command_queue)
 
 #val_gpu = val_gpu.get()[0]
 max_delta = 0.0
 for g in pf.h.grids:
     i1 = val_cpu[g.id]
-    i2 = CPUIMG[g.id].transpose()
+    i2 = CPUIMG[g.id]#.transpose()
     max_delta = max(max_delta, (na.abs(i1 - i2)/(i1 + i2)).max())
+    pylab.clf()
+    pylab.subplot(1,2,1)
+    pylab.imshow(na.log10(val_cpu[g.id]), interpolation='nearest')
+    pylab.subplot(1,2,2)
+    pylab.imshow(na.log10(CPUIMG[g.id]), interpolation='nearest')
+    pylab.savefig("images/grid_%05i.png" % (g.id))
 print "Max delta: %0.3e" % (max_delta)
+print CPUIMG[1]