diff --git a/exercise7/wave_2d_parallel.cu b/exercise7/wave_2d_parallel.cu index 8f926b4..3eb7cab 100644 --- a/exercise7/wave_2d_parallel.cu +++ b/exercise7/wave_2d_parallel.cu @@ -1,3 +1,4 @@ +#include #include #include #include @@ -10,49 +11,27 @@ #include #include -// TASK: T1 -// Include the cooperative groups library -// BEGIN: T1 -#include -// END: T1 +namespace cg = cooperative_groups; -// Convert 'struct timeval' into seconds in double prec. floating point #define WALLTIME(t) ((double)(t).tv_sec + 1e-6 * (double)(t).tv_usec) -// Option to change numerical precision typedef int64_t int_t; typedef double real_t; -// TASK: T1b -// Variables needed for implementation -// BEGIN: T1b - -// Simulation parameters: size, step count, and how often to save the state -int_t - N = 128, - M = 128, - max_iteration = 1000000, - snapshot_freq = 1000; +int_t N = 128, M = 128, max_iteration = 1000000, snapshot_freq = 1000; #define BLOCKX 16 #define BLOCKY 16 -// Wave equation parameters, time step is derived from the space step -const real_t - c = 1.0, - dx = 1.0, - dy = 1.0; -real_t - dt; +const real_t c = 1.0, dx = 1.0, dy = 1.0; +real_t dt; -// Buffers for three time steps, indexed with 2 ghost points for the boundary -real_t *buffers[3] = { NULL, NULL, NULL }; // device buffers +real_t *buffers[3] = { NULL, NULL, NULL }; real_t *h_buffer = NULL; #define U_prv(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1] #define U(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1] #define U_nxt(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1] -// END: T1b #define cudaErrorCheck(ans) \ { \ @@ -66,7 +45,6 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = } } -// Rotate the time step buffers. void move_buffer_window(void) { real_t *temp = buffers[0]; buffers[0] = buffers[1]; @@ -74,25 +52,19 @@ void move_buffer_window(void) { buffers[2] = temp; } -// Save the present time step in a numbered file under 'data/' void domain_save(int_t step) { char filename[256]; - - // Ensure output directory exists (ignore error if it already exists) if (mkdir("data", 0755) != 0 && errno != EEXIST) { perror("mkdir data"); exit(EXIT_FAILURE); } - snprintf(filename, sizeof(filename), "data/%05" PRId64 ".dat", step); - FILE *out = fopen(filename, "wb"); if (out == NULL) { perror("fopen output file"); fprintf(stderr, "Failed to open '%s' for writing.\n", filename); exit(EXIT_FAILURE); } - for (int_t i = 0; i < M; ++i) { size_t written = fwrite(&U(i, 0), sizeof(real_t), (size_t)N, out); if (written != (size_t)N) { @@ -101,116 +73,100 @@ void domain_save(int_t step) { exit(EXIT_FAILURE); } } - if (fclose(out) != 0) { perror("fclose"); exit(EXIT_FAILURE); } } -// TASK: T4 -// Get rid of all the memory allocations void domain_finalize(void) { - // BEGIN: T4 cudaFree(buffers[0]); cudaFree(buffers[1]); cudaFree(buffers[2]); free(h_buffer); - // END: T4 } -// TASK: T6 -// Neumann (reflective) boundary condition -// BEGIN: T6 -__global__ void boundary_condition(real_t *u, int_t M, int_t N) { - int_t idx = blockIdx.x * blockDim.x + threadIdx.x; +// combined kernel for both time step and boundary condition +__global__ void wave_equation_step(real_t *u_prv, real_t *u, real_t *u_nxt, + int_t M, int_t N, real_t c, real_t dt, + real_t dx, real_t dy) { + cg::grid_group grid = cg::this_grid(); + cg::thread_block block = cg::this_thread_block(); - if (idx < M) { - int_t i = idx; - u[(i + 1) * (N + 2) + 0] = u[(i + 1) * (N + 2) + 2]; - u[(i + 1) * (N + 2) + (N + 1)] = u[(i + 1) * (N + 2) + (N - 1)]; - } - - if (idx < N) { - int_t j = idx; - u[0 * (N + 2) + (j + 1)] = u[2 * (N + 2) + (j + 1)]; - u[(M + 1) * (N + 2) + (j + 1)] = u[(M - 1) * (N + 2) + (j + 1)]; - } -} -// END: T6 - -// TASK: T5 -// Integration formula -// BEGIN: T5 -__global__ void time_step(real_t *u_prv, real_t *u, real_t *u_nxt, - int_t M, int_t N, real_t c, real_t dt, - real_t dx, real_t dy) { int_t i = blockIdx.y * blockDim.y + threadIdx.y; int_t j = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= M || j >= N) - return; + // time step + if (i < M && j < N) { + int_t idx = (i + 1) * (N + 2) + (j + 1); + int_t idx_up = (i + 2) * (N + 2) + (j + 1); + int_t idx_down = (i) * (N + 2) + (j + 1); + int_t idx_right = (i + 1) * (N + 2) + (j + 2); + int_t idx_left = (i + 1) * (N + 2) + (j); - int_t idx = (i + 1) * (N + 2) + (j + 1); - int_t idx_up = (i + 2) * (N + 2) + (j + 1); - int_t idx_down = (i) * (N + 2) + (j + 1); - int_t idx_right = (i + 1) * (N + 2) + (j + 2); - int_t idx_left = (i + 1) * (N + 2) + (j); + real_t d2udx2 = (u[idx_right] - 2.0 * u[idx] + u[idx_left]) / (dx * dx); + real_t d2udy2 = (u[idx_up] - 2.0 * u[idx] + u[idx_down]) / (dy * dy); - real_t d2udx2 = (u[idx_right] - 2.0 * u[idx] + u[idx_left]) / (dx * dx); - real_t d2udy2 = (u[idx_up] - 2.0 * u[idx] + u[idx_down]) / (dy * dy); + u_nxt[idx] = 2.0 * u[idx] - u_prv[idx] + (c * dt) * (c * dt) * (d2udx2 + d2udy2); + } - u_nxt[idx] = 2.0 * u[idx] - u_prv[idx] + (c * dt) * (c * dt) * (d2udx2 + d2udy2); + grid.sync(); + + int_t linear_idx = blockIdx.x * blockDim.x * blockDim.y + + threadIdx.y * blockDim.x + threadIdx.x; + + // boundary condition + if (linear_idx < M) { + int_t row = linear_idx; + u_nxt[(row + 1) * (N + 2) + 0] = u_nxt[(row + 1) * (N + 2) + 2]; + u_nxt[(row + 1) * (N + 2) + (N + 1)] = u_nxt[(row + 1) * (N + 2) + (N - 1)]; + } + + if (linear_idx < N) { + int_t col = linear_idx; + u_nxt[0 * (N + 2) + (col + 1)] = u_nxt[2 * (N + 2) + (col + 1)]; + u_nxt[(M + 1) * (N + 2) + (col + 1)] = u_nxt[(M - 1) * (N + 2) + (col + 1)]; + } } -// END: T5 -// TASK: T7 -// Main time integration. void simulate(void) { - // BEGIN: T7 dim3 blockDim(BLOCKX, BLOCKY); dim3 gridDim((N + blockDim.x - 1) / blockDim.x, (M + blockDim.y - 1) / blockDim.y); - int_t boundary_threads = M > N ? M : N; - int_t boundary_blocks = (boundary_threads + BLOCKX * BLOCKY - 1) / (BLOCKX * BLOCKY); - - // Save initial state size_t size = (M + 2) * (N + 2) * sizeof(real_t); cudaMemcpy(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost); domain_save(0); - // Go through each time step + void *kernelArgs[] = { + (void *)&buffers[0], (void *)&buffers[1], (void *)&buffers[2], + (void *)&M, (void *)&N, (void *)&c, (void *)&dt, (void *)&dx, (void *)&dy + }; + for (int_t iteration = 1; iteration <= max_iteration; iteration++) { - time_step<<>>(buffers[0], buffers[1], buffers[2], - M, N, c, dt, dx, dy); + cudaLaunchCooperativeKernel( + (void *)wave_equation_step, + gridDim, blockDim, + kernelArgs); - boundary_condition<<>>(buffers[2], M, N); - - cudaDeviceSynchronize(); + cudaErrorCheck(cudaGetLastError()); + cudaErrorCheck(cudaDeviceSynchronize()); move_buffer_window(); - // Save snapshots at specified frequency if (iteration % snapshot_freq == 0) { cudaMemcpy(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost); domain_save(iteration / snapshot_freq); } } - // END: T7 } -// TASK: T8 -// GPU occupancy void occupancy(void) { - // BEGIN: T8 cudaDeviceProp p; cudaGetDeviceProperties(&p, 0); dim3 blockDim(BLOCKX, BLOCKY); int threads_per_block = blockDim.x * blockDim.y; - - // Calculate grid dimensions dim3 gridDim((N + BLOCKX - 1) / BLOCKX, (N + BLOCKY - 1) / BLOCKY); printf("Grid size set to: (%d, %d)\n", gridDim.x, gridDim.y); @@ -222,22 +178,16 @@ void occupancy(void) { int active_warps = max_blocks_per_sm * warps_per_block; real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm; - if (occupancy_ratio > 1.0) occupancy_ratio = 1.0; printf("Theoretical occupancy: %.6f\n", occupancy_ratio); - // END: T8 } -// TASK: T2 -// Make sure at least one CUDA-capable device exists static bool init_cuda() { - // BEGIN: T2 int count; if (cudaGetDeviceCount(&count) != cudaSuccess) return false; - printf("CUDA device count: %d\n", count); if (count > 0) { @@ -247,6 +197,12 @@ static bool init_cuda() { if (cudaGetDeviceProperties(&p, 0) != cudaSuccess) return false; + // Check cooperative launch support + if (!p.cooperativeLaunch) { + fprintf(stderr, "Device does not support cooperative kernel launch!\n"); + return false; + } + printf("CUDA device #0:\n"); printf(" Name: %s\n", p.name); printf(" Compute capability: %d.%d\n", p.major, p.minor); @@ -255,21 +211,15 @@ static bool init_cuda() { printf(" Global memory: %.1fGiB bytes\n", p.totalGlobalMem / (1024.0 * 1024.0 * 1024.0)); printf(" Per-block shared memory: %.1fKiB\n", p.sharedMemPerBlock / 1024.0); printf(" Per-block registers: %d\n", p.regsPerBlock); + printf(" Cooperative launch: %s\n", p.cooperativeLaunch ? "YES" : "NO"); } return true; - // END: T2 } -// TASK: T3 -// Set up our three buffers, and fill two with an initial perturbation -// Function to determine occupancy and optimal configuration - void domain_initialize(void) { - // BEGIN: T3 bool locate_cuda = init_cuda(); - if (!locate_cuda) { + if (!locate_cuda) exit(EXIT_FAILURE); - } size_t size = (M + 2) * (N + 2) * sizeof(real_t); @@ -281,7 +231,6 @@ void domain_initialize(void) { for (int_t i = 0; i < M; i++) { for (int_t j = 0; j < N; j++) { - // Calculate delta (radial distance) adjusted for M x N grid real_t delta = sqrt(((i - M / 2.0) * (i - M / 2.0)) / (real_t)M + ((j - N / 2.0) * (j - N / 2.0)) / (real_t)N); U_prv(i, j) = U(i, j) = exp(-4.0 * delta * delta); @@ -292,26 +241,20 @@ void domain_initialize(void) { cudaMemcpy(buffers[1], h_buffer, size, cudaMemcpyHostToDevice); cudaMemset(buffers[2], 0, size); - // Set the time step for 2D case dt = dx * dy / (c * sqrt(dx * dx + dy * dy)); } int main(void) { - // Set up the initial state of the domain domain_initialize(); struct timeval t_start, t_end; - gettimeofday(&t_start, NULL); simulate(); gettimeofday(&t_end, NULL); - printf("Total elapsed time: %lf seconds\n", - WALLTIME(t_end) - WALLTIME(t_start)); - + printf("Total elapsed time: %lf seconds\n", WALLTIME(t_end) - WALLTIME(t_start)); occupancy(); - // Clean up and shut down domain_finalize(); exit(EXIT_SUCCESS); }