Commits

Matthew Turk committed 1413fef

Trying to get a simple grid summation going

Comments (0)

Files changed (2)

+__kernel void grid_sum(
+    __read_only image3d_t input_grid,
+    __write_only image2d_t output_plane,
+    uint nz
+)
+{
+    const sampler_t grid_reader =
+        CLK_NORMALIZED_COORDS_FALSE |
+        CLK_ADDRESS_CLAMP |
+        CLK_FILTER_NEAREST;
+
+    __private float4 my_val = (float4)(0.0, 0.0, 0.0, 0.0);
+    __private int xi = get_global_id(0);
+    __private int yi = get_global_id(1);
+    __local int zi;
+    for (zi = 0; zi < nz; zi++) {
+        my_val += read_imagef(input_grid, grid_reader, (int4)(0, zi, yi, xi));
+    }
+    write_imagef(output_plane, (int2)(yi, xi), my_val);
+}
+import matplotlib;matplotlib.use("Agg");import pylab
+from yt.config import ytcfg
+from yt.mods import *
+import pyopencl as cl
+import pyopencl.array as cl_array
+from contextlib import contextmanager
+
+times = {}
+
+@contextmanager
+def time_func(name):
+    t1 = time.time()
+    yield
+    t2 = time.time()
+    print "%s took %0.3e seconds" % (name, t2-t1)
+    times[name] = t2-t1
+
+pf = load("JHK-DD0030/galaxy0030")
+#pf = load("DD0087/DD0087")
+
+# This part is specific to Hex
+platforms = cl.get_platforms()
+GPU = platforms[0]
+CPU = platforms[1]
+devices = GPU.get_devices()
+
+context = cl.Context(devices)
+command_queue = cl.CommandQueue(context)
+
+prg = cl.Program(context, open("grid_sum.cl").read()).build()
+ss = prg.grid_sum
+#ss.set_scalar_arg_dtypes((None, None, na.uint32))
+
+with time_func("Load from disk"):
+    for g in pf.h.grids: g["Density"]
+
+GPUARR = {}
+OUTIMG = {}
+CPUIMG = {}
+
+gformat = cl.ImageFormat(cl.channel_order.R,
+                         cl.channel_type.FLOAT)
+
+with time_func("Copy to GPU"):
+    for g in pf.h.grids:
+        GPUARR[g.id] = cl.Image(context, cl.mem_flags.READ_ONLY,
+                                gformat, hostbuf = g["Density"])
+    cl.enqueue_barrier(command_queue)
+
+with time_func("Create output images on GPU"):
+    for g in pf.h.grids:
+        CPUIMG[g.id] = na.zeros((g.ActiveDimensions[1], g.ActiveDimensions[0]), dtype='float32')
+        OUTIMG[g.id] = cl.Image(context, cl.mem_flags.READ_ONLY,
+                                gformat, hostbuf = CPUIMG[g.id])
+    cl.enqueue_barrier(command_queue)
+
+with time_func("Grid sum into images"):
+    ad = na.empty(1, dtype='uint32')
+    for g in pf.h.grids:
+        ad[0] = g.ActiveDimensions[2]
+        ss(command_queue, 
+            (g.ActiveDimensions[0], g.ActiveDimensions[1]),
+            (g.ActiveDimensions[0], g.ActiveDimensions[1]),
+            GPUARR[g.id], OUTIMG[g.id], na.uint32(ad[0]))
+    cl.enqueue_barrier(command_queue)
+
+with time_func("Copy images back"):
+    for g in pf.h.grids:
+        cl.enqueue_copy(command_queue, CPUIMG[g.id], OUTIMG[g.id],
+                (0,0), (g.ActiveDimensions[1], g.ActiveDimensions[0]))
+    cl.enqueue_barrier(command_queue)
+
+#val_gpu = val_gpu.get()[0]
+#print "On GPU calculated a sum of: %0.9e" % (val_gpu)
+
+with time_func("De-allocate on GPU"):
+    GPUARR.clear()
+    OUTIMG.clear()
+
+with time_func("Sum with NumPy"):
+    val_cpu = dict([(g.id, g["Density"].sum(axis=2)) for g in pf.h.grids])
+
+max_delta = 0.0
+for g in pf.h.grids:
+    i1 = val_cpu[g.id]
+    i2 = CPUIMG[g.id].transpose()
+    max_delta = max(max_delta, (na.abs(i1 - i2)/(i1 + i2)).max())
+print "Max delta: %0.3e" % (max_delta)
+