Commits

Stan Seibert  committed 4734248

Test of uncoalesced writes

  • Participants
  • Parent commits 130479a

Comments (0)

Files changed (2)

File no_coalesce.cu

+/* -*- c++ -*- */
+
+#include <cuda.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Yes, I'm going to hell. */
+#  define CUDA_CHECK_ERROR( call) do {                                         \
+    cudaError err = call;                                                    \
+    if( cudaSuccess != err) {                                                \
+      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
+	      __FILE__, __LINE__, cudaGetErrorString( err) );              \
+      exit(EXIT_FAILURE);                                                      \
+    } } while (0)
+
+///////////////// Kernels ///////////////////////
+
+#define THREADS_PER_BLOCK 256
+
+__global__ void no_coalesce(unsigned int *output)
+{
+  unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
+  // Transpose the block and thread ID to break coalescing
+  // maximally
+  unsigned int write_index = threadIdx.x * gridDim.x + blockIdx.x;
+  output[write_index] = index;
+ }
+
+
+
+////////////////// Host code ///////////
+
+int main()
+{
+  const unsigned int blocks = 65535;
+  const unsigned int n = blocks * THREADS_PER_BLOCK;
+
+  // Device information
+  int device;
+  CUDA_CHECK_ERROR(cudaGetDevice(&device));
+  struct cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, 0);
+  int proc_per_multiproc = 8;
+  if (prop.major == 2) proc_per_multiproc = 32;
+  printf("Device name: %s\n", prop.name);
+  // Bogus normalization metric
+  float bogogflops = 2 * prop.clockRate * prop.multiProcessorCount * proc_per_multiproc / 1e6;
+  printf("BogoGFLOPS: %1.1f\n\n", bogogflops); 
+
+  printf("Size of array: %d elements\n", n);
+
+  // Allocate arrays
+  unsigned int *d_output;
+  CUDA_CHECK_ERROR(cudaMalloc((void **) &d_output, sizeof(unsigned int) * n));
+
+  // Warmup
+  no_coalesce<<<blocks, THREADS_PER_BLOCK>>>(d_output);
+
+  CUDA_CHECK_ERROR(cudaThreadSynchronize());
+
+  cudaEvent_t start, stop;
+  float elapsedTime;
+  CUDA_CHECK_ERROR(cudaEventCreate(&start));
+  CUDA_CHECK_ERROR(cudaEventCreate(&stop));
+
+  CUDA_CHECK_ERROR(cudaEventRecord(start, 0));
+
+  no_coalesce<<<blocks, THREADS_PER_BLOCK>>>(d_output);
+
+  CUDA_CHECK_ERROR(cudaEventRecord(stop, 0));
+  CUDA_CHECK_ERROR(cudaEventSynchronize(stop));
+  CUDA_CHECK_ERROR(cudaEventElapsedTime(&elapsedTime, start,stop));
+
+  // Compute effective memory bandwidth.
+  printf("Output Queue: %1.3f ms, %1.1f MB/sec\n", elapsedTime,
+	 ((float) n / elapsedTime * 1e3 / 1024.0 / 1024.0 * 4));
+
+  return 0;
+}
   unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (index < n) {
-    int value = input[index];
+    //int value = input[index];
+    int value = (int) index;
 
     // Only put odd values into the output queue
     if (value & 1) {