From 1f62b76347f4b6e01f10fd09b0951be0df7fcb79 Mon Sep 17 00:00:00 2001
From: fredrikr79
Date: Tue, 4 Nov 2025 13:50:16 +0100
Subject: [PATCH] ex7: vibe

---
 exercise7/wave_2d_parallel.cu | 121 ++++++++++++++++++++++++----------
 1 file changed, 88 insertions(+), 33 deletions(-)

diff --git a/exercise7/wave_2d_parallel.cu b/exercise7/wave_2d_parallel.cu
index 156df50..bfe1844 100644
--- a/exercise7/wave_2d_parallel.cu
+++ b/exercise7/wave_2d_parallel.cu
@@ -1,4 +1,5 @@
 #include
+#include
 #include
 #include
 #include
@@ -13,7 +14,6 @@
 // Include the cooperative groups library
 // BEGIN: T1
 #include <cooperative_groups.h>
-namespace cg = cooperative_groups; // TODO
 // END: T1
 
 // Convert 'struct timeval' into seconds in double prec. floating point
@@ -34,6 +34,9 @@ int_t
     max_iteration = 1000000,
     snapshot_freq = 1000;
 
+#define BLOCKX 16
+#define BLOCKY 16
+
 // Wave equation parameters, time step is derived from the space step
 const real_t
     c = 1.0,
@@ -43,8 +46,8 @@ real_t dt;
 
 // Buffers for three time steps, indexed with 2 ghost points for the boundary
-real_t
-    *buffers[3] = { NULL, NULL, NULL };
+real_t *buffers[3] = { NULL, NULL, NULL }; // device buffers
+real_t *h_buffer = NULL;
 
 #define U_prv(i, j) buffers[0][((i) + 1) * (N + 2) + (j) + 1]
 #define U(i, j) buffers[1][((i) + 1) * (N + 2) + (j) + 1]
@@ -109,23 +112,29 @@ void domain_save(int_t step) {
 // Get rid of all the memory allocations
 void domain_finalize(void) {
     // BEGIN: T4
-    free(buffers[0]);
-    free(buffers[1]);
-    free(buffers[2]);
+    cudaFree(buffers[0]);
+    cudaFree(buffers[1]);
+    cudaFree(buffers[2]);
+    free(h_buffer);
     // END: T4
 }
 
 // TASK: T6
 // Neumann (reflective) boundary condition
 // BEGIN: T6
-void boundary_condition(void) {
-    for (int_t i = 0; i < M; i++) {
-        U(i, -1) = U(i, 1);
-        U(i, N) = U(i, N - 2);
+__global__ void boundary_condition(real_t *u, int_t M, int_t N) {
+    int_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < M) {
+        int_t i = idx;
+        u[(i + 1) * (N + 2) + 0] = u[(i + 1) * (N + 2) + 2];
+        u[(i + 1) * (N + 2) + (N + 1)] = u[(i + 1) * (N + 2) + (N - 1)];
     }
-    for (int_t j = 0; j < N; j++) {
-        U(-1, j) = U(1, j);
-        U(M, j) = U(M - 2, j);
+
+    if (idx < N) {
+        int_t j = idx;
+        u[0 * (N + 2) + (j + 1)] = u[2 * (N + 2) + (j + 1)];
+        u[(M + 1) * (N + 2) + (j + 1)] = u[(M - 1) * (N + 2) + (j + 1)];
     }
 }
 // END: T6
@@ -133,12 +142,25 @@ void boundary_condition(void) {
 // TASK: T5
 // Integration formula
 // BEGIN: T5
-void time_step(void) {
-    for (int_t i = 0; i < M; i++) {
-        for (int_t j = 0; j < N; j++) {
-            U_nxt(i, j) = -U_prv(i, j) + 2.0 * U(i, j) + (dt * dt * c * c) / (dx * dy) * (U(i - 1, j) + U(i + 1, j) + U(i, j - 1) + U(i, j + 1) - 4.0 * U(i, j));
-        }
-    }
+__global__ void time_step(real_t *u_prv, real_t *u, real_t *u_nxt,
+                          int_t M, int_t N, real_t c, real_t dt,
+                          real_t dx, real_t dy) {
+    int_t i = blockIdx.y * blockDim.y + threadIdx.y;
+    int_t j = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i >= M || j >= N)
+        return;
+
+    int_t idx = (i + 1) * (N + 2) + (j + 1);
+    int_t idx_up = (i + 2) * (N + 2) + (j + 1);
+    int_t idx_down = (i) * (N + 2) + (j + 1);
+    int_t idx_right = (i + 1) * (N + 2) + (j + 2);
+    int_t idx_left = (i + 1) * (N + 2) + (j);
+
+    real_t d2udx2 = (u[idx_right] - 2.0 * u[idx] + u[idx_left]) / (dx * dx);
+    real_t d2udy2 = (u[idx_up] - 2.0 * u[idx] + u[idx_down]) / (dy * dy);
+
+    u_nxt[idx] = 2.0 * u[idx] - u_prv[idx] + (c * dt) * (c * dt) * (d2udx2 + d2udy2);
 }
 // END: T5
 
@@ -147,18 +169,21 @@ void simulate(void) {
     // BEGIN: T7
+    dim3 blockDim(BLOCKX, BLOCKY);
+    dim3 gridDim((N + blockDim.x - 1) / blockDim.x,
+                 (M + blockDim.y - 1) / blockDim.y);
+
+    int_t boundary_threads = M > N ? M : N;
+    int_t boundary_blocks = (boundary_threads + 255) / 256;
+
     // Go through each time step
     for (int_t iteration = 0; iteration <= max_iteration; iteration++) {
         if ((iteration % snapshot_freq) == 0) {
             domain_save(iteration / snapshot_freq);
         }
 
         // Derive step t+1 from steps t and t-1
-        boundary_condition();
-        time_step();
+        boundary_condition<<<boundary_blocks, 256>>>(buffers[1], M, N);
+        time_step<<<gridDim, blockDim>>>(buffers[0], buffers[1], buffers[2],
+                                         M, N, c, dt, dx, dy);
+        cudaDeviceSynchronize();
 
         // Rotate the time step buffers
         move_buffer_window();
     }
     // END: T7
 }
 
@@ -166,7 +191,29 @@ void simulate(void) {
 // GPU occupancy
 void occupancy(void) {
     // BEGIN: T8
-    ;
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+
+    dim3 blockDim(BLOCKX, BLOCKY); // 256 threads per block
+    int threads_per_block = blockDim.x * blockDim.y;
+
+    int warps_per_block = (threads_per_block + 31) / 32;
+
+    int max_warps_per_sm = prop.maxThreadsPerMultiProcessor / 32;
+
+    int max_blocks_per_sm = prop.maxThreadsPerMultiProcessor / threads_per_block;
+    int active_warps = max_blocks_per_sm * warps_per_block;
+
+    real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm;
+
+    if (occupancy_ratio > 1.0)
+        occupancy_ratio = 1.0;
+
+    printf("GPU Occupancy: %.2f%%\n", occupancy_ratio * 100.0);
+    printf("Active warps per SM: %d\n", active_warps);
+    printf("Maximum warps per SM: %d\n", max_warps_per_sm);
+    printf("Threads per block: %d\n", threads_per_block);
+    printf("Max blocks per SM: %d\n", max_blocks_per_sm);
     // END: T8
 }
 
@@ -211,9 +258,13 @@ void domain_initialize(void) {
         exit(EXIT_FAILURE);
     }
 
-    buffers[0] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
-    buffers[1] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
-    buffers[2] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
+    size_t size = (M + 2) * (N + 2) * sizeof(real_t);
+
+    cudaMalloc(&buffers[0], size);
+    cudaMalloc(&buffers[1], size);
+    cudaMalloc(&buffers[2], size);
+
+    h_buffer = (real_t *)malloc(size);
 
     for (int_t i = 0; i < M; i++) {
         for (int_t j = 0; j < N; j++) {
@@ -224,6 +275,10 @@
         }
     }
 
+    cudaMemcpy(buffers[0], h_buffer, size, cudaMemcpyHostToDevice);
+    cudaMemcpy(buffers[1], h_buffer, size, cudaMemcpyHostToDevice);
+    cudaMemset(buffers[2], 0, size);
+
     // Set the time step for 2D case
    dt = dx * dy / (c * sqrt(dx * dx + dy * dy));
 }
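
Two sketches for pieces the diff does not show, for illustration only; neither is part of the patch above.

First, the diff does not show how domain_save() or the body of the initialization loop are adapted, yet any host code that still reads or writes through the U(i, j) macros would now be dereferencing device pointers. A minimal sketch of how the host side could go through the new h_buffer staging buffer instead; the H(i, j) helper, the snapshot file name, and the write loop are assumptions, not taken from the patch:

// Host-side mirror of the U(i, j) indexing macro (assumed helper).
#define H(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]

// In domain_initialize(), the loop would fill H(i, j) on the host;
// the cudaMemcpy calls added by the patch then upload h_buffer.

// domain_save(): stage the current field on the host before writing it out.
void domain_save(int_t step) {
    size_t size = (M + 2) * (N + 2) * sizeof(real_t);
    cudaMemcpy(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost);

    char filename[256];
    sprintf(filename, "data/%.5ld.dat", (long)step); // assumed naming scheme
    FILE *out = fopen(filename, "wb");
    for (int_t i = 0; i < M; i++) {
        // Write the N interior values of row i, skipping the ghost points.
        fwrite(&H(i, 0), sizeof(real_t), N, out);
    }
    fclose(out);
}

Second, the estimate in occupancy() only applies the thread-count limit, so with a 256-thread block it reports 100% on typical devices regardless of the kernel. To cross-check against the actual register and shared-memory usage of time_step, the CUDA runtime's occupancy API can be queried instead; a sketch, assuming it runs in the same translation unit as the kernel:

// Query the achievable occupancy of the time_step kernel itself.
void occupancy_from_runtime(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    int threads_per_block = BLOCKX * BLOCKY;
    int max_active_blocks = 0; // blocks of time_step that fit on one SM
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks,
                                                  time_step,
                                                  threads_per_block, 0);

    int warps_per_block = (threads_per_block + 31) / 32;
    int max_warps_per_sm = prop.maxThreadsPerMultiProcessor / 32;
    double ratio = (double)(max_active_blocks * warps_per_block) / max_warps_per_sm;

    printf("time_step occupancy (runtime API): %.2f%%\n", 100.0 * ratio);
}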