Commits

Stan Seibert committed c796ec7

Cache test for simple sum

Comments (0)

Files changed (1)

+/* -*- c++ -*- */
+
+#include <cuda.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * CUDA_CHECK_ERROR(call): evaluate a CUDA runtime call exactly once and, if
+ * the result is not cudaSuccess, print the source file, line number and the
+ * runtime's error string to stderr, then terminate with EXIT_FAILURE.
+ *
+ * The argument is parenthesized and the temporary carries a macro-private
+ * name so that any expression (including one that mentions a caller variable
+ * named `err`) expands correctly.  The do { } while (0) wrapper makes the
+ * expansion a single statement, so it is safe after an unbraced if/else.
+ */
+#define CUDA_CHECK_ERROR(call) do {                                          \
+    cudaError_t cuda_check_err_ = (call);                                    \
+    if (cudaSuccess != cuda_check_err_) {                                    \
+      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",          \
+              __FILE__, __LINE__, cudaGetErrorString(cuda_check_err_));      \
+      exit(EXIT_FAILURE);                                                    \
+    } } while (0)
+
+///////////////// Kernels ///////////////////////
+
+#define THREADS_PER_BLOCK 256  /* block width used by every launch in main(); also sizes the shared tile in sum_shared */
+
+// Four-element sliding sum read straight from global memory.
+//
+// Each thread computes its global index i from a 1D grid of 1D blocks and,
+// when i < n - 3, writes b[i] = a[i] + a[i+1] + a[i+2] + a[i+3].  Threads
+// with i >= n - 3 do nothing, so the last three elements of b are never
+// written by this kernel.
+__global__ void sum_simple(int n, float *a, float *b)
+{
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Threads whose window would run past the end of a have no work.
+  if (i >= n - 3)
+    return;
+
+  // Accumulate left to right, in the same order as a single chained sum.
+  const float *window = a + i;
+  float total = window[0] + window[1];
+  total += window[2];
+  total += window[3];
+
+  b[i] = total;
+}
+
+// Four-element sliding sum staged through shared memory.
+//
+// Produces the same output as a direct b[i] = a[i] + a[i+1] + a[i+2] + a[i+3]
+// for every i < n - 3, but each block first copies the blockDim.x elements it
+// owns, plus the 3 "halo" elements that follow them, into a shared tile and
+// then sums from the tile.
+//
+// Launch requirements: 1D grid of 1D blocks with
+// blockDim.x <= THREADS_PER_BLOCK (the tile is statically sized for that
+// width).  The halo position is derived from blockDim.x, so narrower blocks
+// are handled correctly.
+__global__ void sum_shared(int n, float *a, float *b)
+{
+  const int HALO = 3;  // extra elements needed beyond the block's own range
+  __shared__ float temp[THREADS_PER_BLOCK + HALO];
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Grab block
+  if (index < n)
+    temp[threadIdx.x] = a[index];
+
+  // and the HALO stragglers beyond it.  The first global index past this
+  // block is (blockIdx.x + 1) * blockDim.x; the loop lets blocks with fewer
+  // than HALO threads still fill every halo slot.
+  const int block_end = (blockIdx.x + 1) * blockDim.x;
+  for (int k = threadIdx.x; k < HALO; k += blockDim.x) {
+    if (block_end + k < n)
+      temp[blockDim.x + k] = a[block_end + k];
+  }
+
+  // Every thread must reach the barrier: tile writes above are read by
+  // neighbouring threads below.
+  __syncthreads();
+
+  // index < n - 3 guarantees all four tile slots read here were loaded.
+  if (index < n - 3)
+    b[index] = temp[threadIdx.x] + temp[threadIdx.x+1] + temp[threadIdx.x+2]
+      + temp[threadIdx.x+3];
+}
+
+
+////////////////// Host code ///////////
+
+// Signature shared by the kernels benchmarked in main().
+typedef void (*sum_kernel_t)(int, float *, float *);
+
+// Launch `kernel` once as an untimed warm-up, then once more bracketed by the
+// `start`/`stop` events, and return the elapsed time of the second launch in
+// milliseconds.  Every launch uses THREADS_PER_BLOCK threads per block and is
+// followed by a cudaGetLastError() check so a bad launch configuration is
+// reported instead of silently producing a meaningless timing.
+static float time_sum_kernel(sum_kernel_t kernel, int blocks, int n,
+                             float *d_a, float *d_b,
+                             cudaEvent_t start, cudaEvent_t stop)
+{
+  float elapsedTime = 0.0f;
+
+  // Warmup
+  kernel<<<blocks, THREADS_PER_BLOCK>>>(n, d_a, d_b);
+  CUDA_CHECK_ERROR(cudaGetLastError());
+  CUDA_CHECK_ERROR(cudaDeviceSynchronize());
+
+  // Timed run
+  CUDA_CHECK_ERROR(cudaEventRecord(start, 0));
+  kernel<<<blocks, THREADS_PER_BLOCK>>>(n, d_a, d_b);
+  CUDA_CHECK_ERROR(cudaGetLastError());
+  CUDA_CHECK_ERROR(cudaEventRecord(stop, 0));
+  CUDA_CHECK_ERROR(cudaEventSynchronize(stop));
+  CUDA_CHECK_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
+
+  return elapsedTime;
+}
+
+// Benchmark driver: prints device information, fills an array of
+// 65535 * THREADS_PER_BLOCK floats with 1.0f, copies it to the device, then
+// times sum_simple and sum_shared on it and prints both results.
+// Returns 0 on success; any CUDA failure terminates via CUDA_CHECK_ERROR.
+int main()
+{
+  const int blocks = 65535;
+  const int n = blocks * THREADS_PER_BLOCK;
+
+  // Device information (for the device that is actually current, not a
+  // hard-coded device 0)
+  int device;
+  CUDA_CHECK_ERROR(cudaGetDevice(&device));
+  struct cudaDeviceProp prop;
+  CUDA_CHECK_ERROR(cudaGetDeviceProperties(&prop, device));
+  int proc_per_multiproc = 8;
+  if (prop.major == 2) proc_per_multiproc = 32;
+  printf("Device name: %s\n", prop.name);
+  // Bogus normalization metric.  The product is evaluated in double: in int
+  // arithmetic it overflows for ordinary values of clockRate and
+  // multiProcessorCount.
+  float bogogflops = 2.0 * prop.clockRate * prop.multiProcessorCount
+    * proc_per_multiproc / 1e6;
+  printf("BogoGFLOPS: %1.1f\n\n", bogogflops);
+
+  printf("\nSize of array: %d elements\n", n);
+
+  // Allocate arrays
+  float *h_a = new float[n];
+  float *d_a;
+  float *d_b;
+  CUDA_CHECK_ERROR(cudaMalloc((void **) &d_a, sizeof(float) * n));
+  CUDA_CHECK_ERROR(cudaMalloc((void **) &d_b, sizeof(float) * n));
+
+  // Generate some data
+  for (int i=0; i < n; i++)
+    h_a[i] = 1.0f;
+
+  // Set device memory
+  CUDA_CHECK_ERROR(cudaMemcpy(d_a, h_a,
+                              sizeof(float) * n,
+                              cudaMemcpyHostToDevice));
+
+  cudaEvent_t start, stop;
+  float elapsedTime;
+  CUDA_CHECK_ERROR(cudaEventCreate(&start));
+  CUDA_CHECK_ERROR(cudaEventCreate(&stop));
+
+  // NOTE(review): the rate printed below is n / time scaled by 2^20, i.e. it
+  // counts elements rather than bytes although the label says MB/sec.  The
+  // formula is left unchanged so results stay comparable; confirm which unit
+  // is intended.
+
+  // Simple (global-memory) version
+  elapsedTime = time_sum_kernel(sum_simple, blocks, n, d_a, d_b, start, stop);
+  printf("Simple sum: %1.3f ms, %1.1f MB/sec\n", elapsedTime,
+         ((float) n / elapsedTime * 1e3 / 1024.0 / 1024.0));
+
+  // Shared-memory version
+  elapsedTime = time_sum_kernel(sum_shared, blocks, n, d_a, d_b, start, stop);
+  printf("Shared sum: %1.3f ms, %1.1f MB/sec\n", elapsedTime,
+         ((float) n / elapsedTime * 1e3 / 1024.0 / 1024.0));
+
+  // Release everything that was acquired above
+  CUDA_CHECK_ERROR(cudaEventDestroy(start));
+  CUDA_CHECK_ERROR(cudaEventDestroy(stop));
+  CUDA_CHECK_ERROR(cudaFree(d_a));
+  CUDA_CHECK_ERROR(cudaFree(d_b));
+  delete [] h_a;
+
+  return 0;
+}