TDT4200/exercise7/wave_2d_parallel.cu

#include <cuda_runtime_api.h>
#include <driver_types.h>
#include <errno.h>
#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>

// TASK: T1
// Include the cooperative groups library
// BEGIN: T1
#include <cooperative_groups.h>
// END: T1

// Convert 'struct timeval' into seconds in double prec. floating point
#define WALLTIME(t) ((double)(t).tv_sec + 1e-6 * (double)(t).tv_usec)

// Option to change numerical precision
typedef int64_t int_t;
typedef double real_t;

// TASK: T1b
// Variables needed for implementation
// BEGIN: T1b

// Simulation parameters: size, step count, and how often to save the state
int_t
    N = 128,
    M = 128,
    max_iteration = 1000000,
    snapshot_freq = 1000;

#define BLOCKX 16
#define BLOCKY 16

// Wave equation parameters, time step is derived from the space step
const real_t
    c = 1.0,
    dx = 1.0,
    dy = 1.0;
real_t
    dt;

// Buffers for three time steps, indexed with 2 ghost points for the boundary
real_t *buffers[3] = { NULL, NULL, NULL }; // device buffers
real_t *h_buffer = NULL;

#define U_prv(i, j) buffers[0][((i) + 1) * (N + 2) + (j) + 1]
#define U(i, j) buffers[1][((i) + 1) * (N + 2) + (j) + 1]
#define U_nxt(i, j) buffers[2][((i) + 1) * (N + 2) + (j) + 1]
// END: T1b

#define cudaErrorCheck(ans)               \
  {                                       \
    gpuAssert((ans), __FILE__, __LINE__); \
  }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
      exit(code);
  }
}

// Rotate the time step buffers.
void move_buffer_window(void) {
  real_t *temp = buffers[0];
  buffers[0] = buffers[1];
  buffers[1] = buffers[2];
  buffers[2] = temp;
}

// Save the present time step in a numbered file under 'data/'
void domain_save(int_t step) {
  char filename[256];

  // Ensure output directory exists (ignore error if it already exists)
  if (mkdir("data", 0755) != 0 && errno != EEXIST) {
    perror("mkdir data");
    exit(EXIT_FAILURE);
  }

  snprintf(filename, sizeof(filename), "data/%05" PRId64 ".dat", step);

  FILE *out = fopen(filename, "wb");
  if (out == NULL) {
    perror("fopen output file");
    fprintf(stderr, "Failed to open '%s' for writing.\n", filename);
    exit(EXIT_FAILURE);
  }

  for (int_t i = 0; i < M; ++i) {
    size_t written = fwrite(&U(i, 0), sizeof(real_t), (size_t)N, out);
    if (written != (size_t)N) {
      perror("fwrite");
      fclose(out);
      exit(EXIT_FAILURE);
    }
  }

  if (fclose(out) != 0) {
    perror("fclose");
    exit(EXIT_FAILURE);
  }
}

// TASK: T4
// Get rid of all the memory allocations
void domain_finalize(void) {
  // BEGIN: T4
  cudaFree(buffers[0]);
  cudaFree(buffers[1]);
  cudaFree(buffers[2]);
  free(h_buffer);
  // END: T4
}

// TASK: T6
// Neumann (reflective) boundary condition
// BEGIN: T6
__global__ void boundary_condition(real_t *u, int_t M, int_t N) {
  int_t idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < M) {
    int_t i = idx;
    u[(i + 1) * (N + 2) + 0] = u[(i + 1) * (N + 2) + 2];
    u[(i + 1) * (N + 2) + (N + 1)] = u[(i + 1) * (N + 2) + (N - 1)];
  }

  if (idx < N) {
    int_t j = idx;
    u[0 * (N + 2) + (j + 1)] = u[2 * (N + 2) + (j + 1)];
    u[(M + 1) * (N + 2) + (j + 1)] = u[(M - 1) * (N + 2) + (j + 1)];
  }
}
// END: T6

// TASK: T5
// Integration formula
// BEGIN: T5
__global__ void time_step(real_t *u_prv, real_t *u, real_t *u_nxt,
                          int_t M, int_t N, real_t c, real_t dt,
                          real_t dx, real_t dy) {
  int_t i = blockIdx.y * blockDim.y + threadIdx.y;
  int_t j = blockIdx.x * blockDim.x + threadIdx.x;

  if (i >= M || j >= N)
    return;

  int_t idx = (i + 1) * (N + 2) + (j + 1);
  int_t idx_up = (i + 2) * (N + 2) + (j + 1);
  int_t idx_down = (i) * (N + 2) + (j + 1);
  int_t idx_right = (i + 1) * (N + 2) + (j + 2);
  int_t idx_left = (i + 1) * (N + 2) + (j);

  real_t d2udx2 = (u[idx_right] - 2.0 * u[idx] + u[idx_left]) / (dx * dx);
  real_t d2udy2 = (u[idx_up] - 2.0 * u[idx] + u[idx_down]) / (dy * dy);

  u_nxt[idx] = 2.0 * u[idx] - u_prv[idx] + (c * dt) * (c * dt) * (d2udx2 + d2udy2);
}
// END: T5

// TASK: T7
// Main time integration.
void simulate(void) {
  // BEGIN: T7
  // Go through each time step
  dim3 blockDim(16, 16);
  dim3 gridDim((N + blockDim.x - 1) / blockDim.x,
               (M + blockDim.y - 1) / blockDim.y);

  int_t boundary_threads = M > N ? M : N;
  int_t boundary_blocks = (boundary_threads + 255) / 256;

  time_step<<<gridDim, blockDim>>>(buffers[0], buffers[1], buffers[2],
                                   M, N, c, dt, dx, dy);

  boundary_condition<<<boundary_blocks, 256>>>(buffers[2], M, N);

  cudaDeviceSynchronize();

  move_buffer_window();
  // END: T7
}

// TASK: T8
// GPU occupancy
void occupancy(void) {
  // BEGIN: T8
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);

  dim3 blockDim(BLOCKX, BLOCKY); // 256 threads per block
  int threads_per_block = blockDim.x * blockDim.y;

  int warps_per_block = (threads_per_block + 31) / 32;

  int max_warps_per_sm = prop.maxThreadsPerMultiProcessor / 32;

  int max_blocks_per_sm = prop.maxThreadsPerMultiProcessor / threads_per_block;
  int active_warps = max_blocks_per_sm * warps_per_block;

  real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm;

  if (occupancy_ratio > 1.0)
    occupancy_ratio = 1.0;

  printf("GPU Occupancy: %.2f%%\n", occupancy_ratio * 100.0);
  printf("Active warps per SM: %d\n", active_warps);
  printf("Maximum warps per SM: %d\n", max_warps_per_sm);
  printf("Threads per block: %d\n", threads_per_block);
  printf("Max blocks per SM: %d\n", max_blocks_per_sm);
  // END: T8
}

// TASK: T2
// Make sure at least one CUDA-capable device exists
static bool init_cuda() {
  // BEGIN: T2
  int count;
  if (cudaGetDeviceCount(&count) != cudaSuccess)
    return false;

  printf("CUDA device count: %d\n", count);

  if (count > 0) {
    cudaDeviceProp p;
    if (cudaSetDevice(0) != cudaSuccess)
      return false;
    if (cudaGetDeviceProperties(&p, 0) != cudaSuccess)
      return false;

    printf("CUDA device #0:\n");
    printf("  Name: %s\n", p.name);
    printf("  Compute capability: %d.%d\n", p.major, p.minor);
    printf("  Multiprocessors: %d\n", p.multiProcessorCount);
    printf("  Warp size: %d\n", p.warpSize);
    printf("  Global memory: %.1fGiB bytes\n", p.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    printf("  Per-block shared memory: %.1fKiB\n", p.sharedMemPerBlock / 1024.0);
    printf("  Per-block registers: %d\n", p.regsPerBlock);
  }
  return true;
  // END: T2
}

// TASK: T3
// Set up our three buffers, and fill two with an initial perturbation
// Function to determine occupancy and optimal configuration

void domain_initialize(void) {
  // BEGIN: T3
  bool locate_cuda = init_cuda();
  if (!locate_cuda) {
    exit(EXIT_FAILURE);
  }

  size_t size = (M + 2) * (N + 2) * sizeof(real_t);

  cudaMalloc(&buffers[0], size);
  cudaMalloc(&buffers[1], size);
  cudaMalloc(&buffers[2], size);

  h_buffer = (real_t *)malloc(size);

  for (int_t i = 0; i < M; i++) {
    for (int_t j = 0; j < N; j++) {
      // Calculate delta (radial distance) adjusted for M x N grid
      real_t delta = sqrt(((i - M / 2.0) * (i - M / 2.0)) / (real_t)M +
                          ((j - N / 2.0) * (j - N / 2.0)) / (real_t)N);
      U_prv(i, j) = U(i, j) = exp(-4.0 * delta * delta);
    }
  }

  cudaMemcpy(buffers[0], h_buffer, size, cudaMemcpyHostToDevice);
  cudaMemcpy(buffers[1], h_buffer, size, cudaMemcpyHostToDevice);
  cudaMemset(buffers[2], 0, size);

  // Set the time step for 2D case
  dt = dx * dy / (c * sqrt(dx * dx + dy * dy));
}

int main(void) {
  // Set up the initial state of the domain
  domain_initialize();

  struct timeval t_start, t_end;

  gettimeofday(&t_start, NULL);
  simulate();
  gettimeofday(&t_end, NULL);

  printf("Total elapsed time: %lf seconds\n",
         WALLTIME(t_end) - WALLTIME(t_start));

  occupancy();

  // Clean up and shut down
  domain_finalize();
  exit(EXIT_SUCCESS);
}