From 1f62b76347f4b6e01f10fd09b0951be0df7fcb79 Mon Sep 17 00:00:00 2001
From: fredrikr79
Date: Tue, 4 Nov 2025 13:50:16 +0100
Subject: [PATCH] ex7: vibe

---
 exercise7/wave_2d_parallel.cu | 121 ++++++++++++++++++++++++----------
 1 file changed, 88 insertions(+), 33 deletions(-)

diff --git a/exercise7/wave_2d_parallel.cu b/exercise7/wave_2d_parallel.cu
index 156df50..bfe1844 100644
--- a/exercise7/wave_2d_parallel.cu
+++ b/exercise7/wave_2d_parallel.cu
@@ -1,4 +1,5 @@
 #include
+#include
 #include
 #include
 #include
@@ -13,7 +14,6 @@
 // Include the cooperative groups library
 // BEGIN: T1
 #include <cooperative_groups.h>
-namespace cg = cooperative_groups; // TODO
 // END: T1
 
 // Convert 'struct timeval' into seconds in double prec. floating point
@@ -34,6 +34,9 @@ int_t
     max_iteration = 1000000,
     snapshot_freq = 1000;
 
+#define BLOCKX 16
+#define BLOCKY 16
+
 // Wave equation parameters, time step is derived from the space step
 const real_t
     c = 1.0,
@@ -43,8 +46,8 @@ real_t dt;
 
 // Buffers for three time steps, indexed with 2 ghost points for the boundary
-real_t
-    *buffers[3] = { NULL, NULL, NULL };
+real_t *buffers[3] = { NULL, NULL, NULL }; // device buffers
+real_t *h_buffer = NULL;
 
 #define U_prv(i, j) buffers[0][((i) + 1) * (N + 2) + (j) + 1]
 #define U(i, j) buffers[1][((i) + 1) * (N + 2) + (j) + 1]
@@ -109,23 +112,29 @@ void domain_save(int_t step) {
 // Get rid of all the memory allocations
 void domain_finalize(void) {
     // BEGIN: T4
-    free(buffers[0]);
-    free(buffers[1]);
-    free(buffers[2]);
+    cudaFree(buffers[0]);
+    cudaFree(buffers[1]);
+    cudaFree(buffers[2]);
+    free(h_buffer);
     // END: T4
 }
 
 // TASK: T6
 // Neumann (reflective) boundary condition
 // BEGIN: T6
-void boundary_condition(void) {
-    for (int_t i = 0; i < M; i++) {
-        U(i, -1) = U(i, 1);
-        U(i, N) = U(i, N - 2);
+__global__ void boundary_condition(real_t *u, int_t M, int_t N) {
+    int_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < M) {
+        int_t i = idx;
+        u[(i + 1) * (N + 2) + 0] = u[(i + 1) * (N + 2) + 2];
+        u[(i + 1) * (N + 2) + (N + 1)] = u[(i + 1) * (N + 2) + (N - 1)];
     }
-    for (int_t j = 0; j < N; j++) {
-        U(-1, j) = U(1, j);
-        U(M, j) = U(M - 2, j);
+
+    if (idx < N) {
+        int_t j = idx;
+        u[0 * (N + 2) + (j + 1)] = u[2 * (N + 2) + (j + 1)];
+        u[(M + 1) * (N + 2) + (j + 1)] = u[(M - 1) * (N + 2) + (j + 1)];
     }
 }
 // END: T6
@@ -133,12 +142,25 @@ void boundary_condition(void) {
 // TASK: T5
 // Integration formula
 // BEGIN: T5
-void time_step(void) {
-    for (int_t i = 0; i < M; i++) {
-        for (int_t j = 0; j < N; j++) {
-            U_nxt(i, j) = -U_prv(i, j) + 2.0 * U(i, j) + (dt * dt * c * c) / (dx * dy) * (U(i - 1, j) + U(i + 1, j) + U(i, j - 1) + U(i, j + 1) - 4.0 * U(i, j));
-        }
-    }
+__global__ void time_step(real_t *u_prv, real_t *u, real_t *u_nxt,
+                          int_t M, int_t N, real_t c, real_t dt,
+                          real_t dx, real_t dy) {
+    int_t i = blockIdx.y * blockDim.y + threadIdx.y;
+    int_t j = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i >= M || j >= N)
+        return;
+
+    int_t idx = (i + 1) * (N + 2) + (j + 1);
+    int_t idx_up = (i + 2) * (N + 2) + (j + 1);
+    int_t idx_down = (i) * (N + 2) + (j + 1);
+    int_t idx_right = (i + 1) * (N + 2) + (j + 2);
+    int_t idx_left = (i + 1) * (N + 2) + (j);
+
+    real_t d2udx2 = (u[idx_right] - 2.0 * u[idx] + u[idx_left]) / (dx * dx);
+    real_t d2udy2 = (u[idx_up] - 2.0 * u[idx] + u[idx_down]) / (dy * dy);
+
+    u_nxt[idx] = 2.0 * u[idx] - u_prv[idx] + (c * dt) * (c * dt) * (d2udx2 + d2udy2);
 }
 // END: T5
 
@@ -147,18 +169,21 @@ void simulate(void) {
     // BEGIN: T7
+    dim3 blockDim(BLOCKX, BLOCKY);
+    dim3 gridDim((N + blockDim.x - 1) / blockDim.x,
+                 (M + blockDim.y - 1) / blockDim.y);
+
+    int_t boundary_threads = M > N ? M : N;
+    int_t boundary_blocks = (boundary_threads + 255) / 256;
+
     // Go through each time step
     for (int_t iteration = 0; iteration <= max_iteration; iteration++) {
         if ((iteration % snapshot_freq) == 0) {
             domain_save(iteration / snapshot_freq);
         }
 
         // Derive step t+1 from steps t and t-1
-        boundary_condition();
-        time_step();
+        boundary_condition<<<boundary_blocks, 256>>>(buffers[1], M, N);
+        time_step<<<gridDim, blockDim>>>(buffers[0], buffers[1], buffers[2],
+                                         M, N, c, dt, dx, dy);
+        cudaDeviceSynchronize();
 
         // Rotate the time step buffers
         move_buffer_window();
     }
     // END: T7
 }
 
@@ -166,7 +191,29 @@ void simulate(void) {
 // GPU occupancy
 void occupancy(void) {
     // BEGIN: T8
-    ;
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+
+    dim3 blockDim(BLOCKX, BLOCKY); // 256 threads per block
+    int threads_per_block = blockDim.x * blockDim.y;
+
+    int warps_per_block = (threads_per_block + 31) / 32;
+
+    int max_warps_per_sm = prop.maxThreadsPerMultiProcessor / 32;
+
+    int max_blocks_per_sm = prop.maxThreadsPerMultiProcessor / threads_per_block;
+    int active_warps = max_blocks_per_sm * warps_per_block;
+
+    real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm;
+
+    if (occupancy_ratio > 1.0)
+        occupancy_ratio = 1.0;
+
+    printf("GPU Occupancy: %.2f%%\n", occupancy_ratio * 100.0);
+    printf("Active warps per SM: %d\n", active_warps);
+    printf("Maximum warps per SM: %d\n", max_warps_per_sm);
+    printf("Threads per block: %d\n", threads_per_block);
+    printf("Max blocks per SM: %d\n", max_blocks_per_sm);
     // END: T8
 }
 
@@ -211,9 +258,13 @@ void domain_initialize(void) {
         exit(EXIT_FAILURE);
     }
 
-    buffers[0] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
-    buffers[1] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
-    buffers[2] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
+    size_t size = (M + 2) * (N + 2) * sizeof(real_t);
+
+    cudaMalloc(&buffers[0], size);
+    cudaMalloc(&buffers[1], size);
+    cudaMalloc(&buffers[2], size);
+
+    h_buffer = (real_t *)malloc(size);
 
     for (int_t i = 0; i < M; i++) {
         for (int_t j = 0; j < N; j++) {
@@ -224,6 +275,10 @@
         }
     }
 
+    cudaMemcpy(buffers[0], h_buffer, size, cudaMemcpyHostToDevice);
+    cudaMemcpy(buffers[1], h_buffer, size, cudaMemcpyHostToDevice);
+    cudaMemset(buffers[2], 0, size);
+
     // Set the time step for 2D case
    dt = dx * dy / (c * sqrt(dx * dx + dy * dy));
 }
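
Two sketches for pieces the diff does not show, for illustration only; neither is part of the patch above.

First, the diff does not show how domain_save() or the body of the initialization loop are adapted, yet any host code that still reads or writes through the U(i, j) macros would now be dereferencing device pointers. A minimal sketch of how the host side could go through the new h_buffer staging buffer instead; the H(i, j) helper, the snapshot file name, and the write loop are assumptions, not taken from the patch:

// Host-side mirror of the U(i, j) indexing macro (assumed helper).
#define H(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]

// In domain_initialize(), the loop would fill H(i, j) on the host;
// the cudaMemcpy calls added by the patch then upload h_buffer.

// domain_save(): stage the current field on the host before writing it out.
void domain_save(int_t step) {
    size_t size = (M + 2) * (N + 2) * sizeof(real_t);
    cudaMemcpy(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost);

    char filename[256];
    sprintf(filename, "data/%.5ld.dat", (long)step); // assumed naming scheme
    FILE *out = fopen(filename, "wb");
    for (int_t i = 0; i < M; i++) {
        // Write the N interior values of row i, skipping the ghost points.
        fwrite(&H(i, 0), sizeof(real_t), N, out);
    }
    fclose(out);
}

Second, the estimate in occupancy() only applies the thread-count limit, so with a 256-thread block it reports 100% on typical devices regardless of the kernel. To cross-check against the actual register and shared-memory usage of time_step, the CUDA runtime's occupancy API can be queried instead; a sketch, assuming it runs in the same translation unit as the kernel:

// Query the achievable occupancy of the time_step kernel itself.
void occupancy_from_runtime(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    int threads_per_block = BLOCKX * BLOCKY;
    int max_active_blocks = 0; // blocks of time_step that fit on one SM
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks,
                                                  time_step,
                                                  threads_per_block, 0);

    int warps_per_block = (threads_per_block + 31) / 32;
    int max_warps_per_sm = prop.maxThreadsPerMultiProcessor / 32;
    double ratio = (double)(max_active_blocks * warps_per_block) / max_warps_per_sm;

    printf("time_step occupancy (runtime API): %.2f%%\n", 100.0 * ratio);
}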