ex7: port the wave solver to CUDA (device buffers, time_step and boundary_condition kernels, occupancy report)

2025-11-04 13:50:16 +01:00
parent 0813765831
commit 1f62b76347


@@ -1,4 +1,5 @@
#include <cuda_runtime_api.h>
#include <driver_types.h>
#include <errno.h>
#include <inttypes.h>
#include <math.h>
@@ -13,7 +14,6 @@
// Include the cooperative groups library
// BEGIN: T1
#include <cooperative_groups.h>
namespace cg = cooperative_groups; // TODO
// END: T1
// Convert 'struct timeval' into seconds in double prec. floating point
@@ -34,6 +34,9 @@ int_t
max_iteration = 1000000,
snapshot_freq = 1000;
#define BLOCKX 16
#define BLOCKY 16
// Wave equation parameters, time step is derived from the space step
const real_t
c = 1.0,
@@ -43,8 +46,8 @@ real_t
dt;
// Buffers for three time steps, indexed with 2 ghost points for the boundary
real_t
*buffers[3] = { NULL, NULL, NULL };
real_t *buffers[3] = { NULL, NULL, NULL }; // device buffers
real_t *h_buffer = NULL;
#define U_prv(i, j) buffers[0][((i) + 1) * (N + 2) + (j) + 1]
#define U(i, j) buffers[1][((i) + 1) * (N + 2) + (j) + 1]
@@ -109,23 +112,29 @@ void domain_save(int_t step) {
// Get rid of all the memory allocations
void domain_finalize(void) {
// BEGIN: T4
free(buffers[0]);
free(buffers[1]);
free(buffers[2]);
cudaFree(buffers[0]);
cudaFree(buffers[1]);
cudaFree(buffers[2]);
free(h_buffer);
// END: T4
}
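The cudaFree calls above, like the cudaMalloc calls later in the commit, discard their return codes. A minimal error-checking sketch, built around a hypothetical CUDA_CHECK macro that is not part of this commit:

#define CUDA_CHECK(call)                                              \
  do {                                                                \
    cudaError_t err_ = (call);                                        \
    if (err_ != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
              cudaGetErrorString(err_), __FILE__, __LINE__);          \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while (0)

// e.g. CUDA_CHECK(cudaFree(buffers[0])); in place of the bare calls above.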
// TASK: T6
// Neumann (reflective) boundary condition
// BEGIN: T6
void boundary_condition(void) {
for (int_t i = 0; i < M; i++) {
U(i, -1) = U(i, 1);
U(i, N) = U(i, N - 2);
__global__ void boundary_condition(real_t *u, int_t M, int_t N) {
int_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < M) {
int_t i = idx;
u[(i + 1) * (N + 2) + 0] = u[(i + 1) * (N + 2) + 2];
u[(i + 1) * (N + 2) + (N + 1)] = u[(i + 1) * (N + 2) + (N - 1)];
}
for (int_t j = 0; j < N; j++) {
U(-1, j) = U(1, j);
U(M, j) = U(M - 2, j);
if (idx < N) {
int_t j = idx;
u[0 * (N + 2) + (j + 1)] = u[2 * (N + 2) + (j + 1)];
u[(M + 1) * (N + 2) + (j + 1)] = u[(M - 1) * (N + 2) + (j + 1)];
}
}
// END: T6
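For reference, the flat indices written by this kernel touch the same ghost cells the removed host loops reached through the U macro (with u playing the role of buffers[1]):

// u[(i + 1) * (N + 2) + 0]        corresponds to U(i, -1)  (left ghost column)
// u[(i + 1) * (N + 2) + (N + 1)]  corresponds to U(i, N)   (right ghost column)
// u[0 * (N + 2) + (j + 1)]        corresponds to U(-1, j)  (top ghost row)
// u[(M + 1) * (N + 2) + (j + 1)]  corresponds to U(M, j)   (bottom ghost row)

so the reflective (Neumann) condition is unchanged, just expressed against a raw device pointer.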
@@ -133,12 +142,25 @@ void boundary_condition(void) {
// TASK: T5
// Integration formula
// BEGIN: T5
void time_step(void) {
for (int_t i = 0; i < M; i++) {
for (int_t j = 0; j < N; j++) {
U_nxt(i, j) = -U_prv(i, j) + 2.0 * U(i, j) + (dt * dt * c * c) / (dx * dy) * (U(i - 1, j) + U(i + 1, j) + U(i, j - 1) + U(i, j + 1) - 4.0 * U(i, j));
}
}
__global__ void time_step(real_t *u_prv, real_t *u, real_t *u_nxt,
int_t M, int_t N, real_t c, real_t dt,
real_t dx, real_t dy) {
int_t i = blockIdx.y * blockDim.y + threadIdx.y;
int_t j = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= M || j >= N)
return;
int_t idx = (i + 1) * (N + 2) + (j + 1);
int_t idx_up = (i + 2) * (N + 2) + (j + 1);
int_t idx_down = (i) * (N + 2) + (j + 1);
int_t idx_right = (i + 1) * (N + 2) + (j + 2);
int_t idx_left = (i + 1) * (N + 2) + (j);
real_t d2udx2 = (u[idx_right] - 2.0 * u[idx] + u[idx_left]) / (dx * dx);
real_t d2udy2 = (u[idx_up] - 2.0 * u[idx] + u[idx_down]) / (dy * dy);
u_nxt[idx] = 2.0 * u[idx] - u_prv[idx] + (c * dt) * (c * dt) * (d2udx2 + d2udy2);
}
// END: T5
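Written out, the kernel computes the standard centered-difference update for the 2D wave equation (j varies along x and i along y, matching the kernel's indexing):

u_{i,j}^{t+1} = 2u_{i,j}^{t} - u_{i,j}^{t-1} + c^2\,\Delta t^2\left(\frac{u_{i,j+1}^{t} - 2u_{i,j}^{t} + u_{i,j-1}^{t}}{\Delta x^2} + \frac{u_{i+1,j}^{t} - 2u_{i,j}^{t} + u_{i-1,j}^{t}}{\Delta y^2}\right)

This is equivalent to the removed one-line host expression whenever dx == dy, since that form divides by dx * dy instead of the separate dx^2 and dy^2 factors.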
@@ -147,18 +169,21 @@ void time_step(void) {
void simulate(void) {
// BEGIN: T7
// Go through each time step
for (int_t iteration = 0; iteration <= max_iteration; iteration++) {
if ((iteration % snapshot_freq) == 0) {
domain_save(iteration / snapshot_freq);
}
dim3 blockDim(16, 16);
dim3 gridDim((N + blockDim.x - 1) / blockDim.x,
(M + blockDim.y - 1) / blockDim.y);
// Derive step t+1 from steps t and t-1
boundary_condition();
time_step();
int_t boundary_threads = M > N ? M : N;
int_t boundary_blocks = (boundary_threads + 255) / 256;
// Rotate the time step buffers
move_buffer_window();
}
time_step<<<gridDim, blockDim>>>(buffers[0], buffers[1], buffers[2],
M, N, c, dt, dx, dy);
boundary_condition<<<boundary_blocks, 256>>>(buffers[2], M, N);
cudaDeviceSynchronize();
move_buffer_window();
// END: T7
}
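Kernel launches are asynchronous and report configuration errors lazily, so a failed launch in this loop would only surface, if at all, at cudaDeviceSynchronize. A minimal sketch of explicit launch checking, not part of this commit:

cudaError_t launch_err = cudaGetLastError(); // returns and clears the last launch error
if (launch_err != cudaSuccess) {
  fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(launch_err));
  exit(EXIT_FAILURE);
}

placed after each <<<...>>> launch; the return value of cudaDeviceSynchronize itself could be checked the same way.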
@@ -166,7 +191,29 @@ void simulate(void) {
// GPU occupancy
void occupancy(void) {
// BEGIN: T8
;
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
dim3 blockDim(BLOCKX, BLOCKY); // 256 threads per block
int threads_per_block = blockDim.x * blockDim.y;
int warps_per_block = (threads_per_block + 31) / 32;
int max_warps_per_sm = prop.maxThreadsPerMultiProcessor / 32;
int max_blocks_per_sm = prop.maxThreadsPerMultiProcessor / threads_per_block;
int active_warps = max_blocks_per_sm * warps_per_block;
real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm;
if (occupancy_ratio > 1.0)
occupancy_ratio = 1.0;
printf("GPU Occupancy: %.2f%%\n", occupancy_ratio * 100.0);
printf("Active warps per SM: %d\n", active_warps);
printf("Maximum warps per SM: %d\n", max_warps_per_sm);
printf("Threads per block: %d\n", threads_per_block);
printf("Max blocks per SM: %d\n", max_blocks_per_sm);
// END: T8
}
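The estimate above only folds in the thread-count limit per SM (prop.maxThreadsPerMultiProcessor); register and shared-memory pressure can lower occupancy further. A sketch using the runtime occupancy API against the time_step kernel defined above, not part of this commit:

int max_active_blocks = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, time_step,
                                              BLOCKX * BLOCKY, 0 /* dynamic smem */);
printf("Runtime-reported max active blocks per SM: %d\n", max_active_blocks);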
@@ -211,9 +258,13 @@ void domain_initialize(void) {
exit(EXIT_FAILURE);
}
buffers[0] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
buffers[1] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
buffers[2] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
size_t size = (M + 2) * (N + 2) * sizeof(real_t);
cudaMalloc(&buffers[0], size);
cudaMalloc(&buffers[1], size);
cudaMalloc(&buffers[2], size);
h_buffer = (real_t *)malloc(size);
for (int_t i = 0; i < M; i++) {
for (int_t j = 0; j < N; j++) {
@@ -224,6 +275,10 @@ void domain_initialize(void) {
}
}
cudaMemcpy(buffers[0], h_buffer, size, cudaMemcpyHostToDevice);
cudaMemcpy(buffers[1], h_buffer, size, cudaMemcpyHostToDevice);
cudaMemset(buffers[2], 0, size);
// Set the time step for 2D case
dt = dx * dy / (c * sqrt(dx * dx + dy * dy));
}
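Two closing notes. The dt expression above matches the 2D CFL stability limit, dt = dx * dy / (c * sqrt(dx^2 + dy^2)), i.e. the largest time step for which this explicit scheme stays stable. Also, domain_save is not shown in this diff, but with the three buffers now living in device memory it presumably has to stage the current step through the newly added h_buffer before writing, along the lines of this sketch (not shown in the commit):

cudaMemcpy(h_buffer, buffers[1], (M + 2) * (N + 2) * sizeof(real_t),
           cudaMemcpyDeviceToHost);
// ... then write h_buffer to the snapshot file as before.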