ex7: vibe
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <driver_types.h>
|
||||
#include <errno.h>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
@@ -13,7 +14,6 @@
|
||||
// Include the cooperative groups library
|
||||
// BEGIN: T1
|
||||
#include <cooperative_groups.h>
|
||||
namespace cg = cooperative_groups; // TODO
|
||||
// END: T1
|
||||
|
||||
// Convert 'struct timeval' into seconds in double prec. floating point
|
||||
@@ -34,6 +34,9 @@ int_t
|
||||
max_iteration = 1000000,
|
||||
snapshot_freq = 1000;
|
||||
|
||||
#define BLOCKX 16
|
||||
#define BLOCKY 16
|
||||
|
||||
// Wave equation parameters, time step is derived from the space step
|
||||
const real_t
|
||||
c = 1.0,
|
||||
@@ -43,8 +46,8 @@ real_t
|
||||
dt;
|
||||
|
||||
// Buffers for three time steps, indexed with 2 ghost points for the boundary
|
||||
// Device-side storage for the three time steps (filled in by cudaMalloc).
// NOTE(review): the U_prv/U/U_nxt macros below index buffers[] directly, so
// they dereference device pointers — confirm no host code still uses them.
real_t *buffers[3] = { NULL, NULL, NULL };

// Host-side staging buffer (allocated with malloc) used to move data to and
// from the device.
real_t *h_buffer = NULL;
|
||||
|
||||
#define U_prv(i, j) buffers[0][((i) + 1) * (N + 2) + (j) + 1]
|
||||
#define U(i, j) buffers[1][((i) + 1) * (N + 2) + (j) + 1]
|
||||
@@ -109,23 +112,29 @@ void domain_save(int_t step) {
|
||||
// Get rid of all the memory allocations
|
||||
void domain_finalize(void) {
    // BEGIN: T4
    // The three time-step buffers live in device memory, so they must be
    // released with cudaFree only; passing them to free() is invalid.
    for (int b = 0; b < 3; b++) {
        cudaError_t err = cudaFree(buffers[b]);
        if (err != cudaSuccess) {
            // Best effort: report the failure but keep releasing the rest.
            fprintf(stderr, "cudaFree(buffers[%d]) failed: %s\n", b,
                    cudaGetErrorString(err));
        }
        // Reset so a repeated call does not free the same pointer twice.
        buffers[b] = NULL;
    }

    // The staging buffer is ordinary host memory.
    free(h_buffer);
    h_buffer = NULL;
    // END: T4
}
|
||||
|
||||
// TASK: T6
|
||||
// Neumann (reflective) boundary condition
|
||||
// BEGIN: T6
|
||||
// Mirror the interior values next to each edge into the surrounding ghost
// cells of `u`, which is stored row-major with N + 2 values per row and
// M + 2 rows (one ghost cell on every side).
//
// Launch layout: 1D grid of 1D blocks. Each index handles the left/right
// ghost cells of one row (while idx < M) and the top/bottom ghost cells of
// one column (while idx < N). The grid-stride loop makes any grid size valid;
// launching at least max(M, N) threads gives one index per thread.
// Only ghost cells are written; the four corner ghost cells are not touched.
__global__ void boundary_condition(real_t *u, int_t M, int_t N) {
    const int_t pitch = N + 2;
    const int_t count = M > N ? M : N;
    // Widen before multiplying so the product is not computed in 32 bits.
    const int_t stride = (int_t)gridDim.x * blockDim.x;

    for (int_t idx = (int_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < count; idx += stride) {
        if (idx < M) {
            // Row idx of the interior: left and right ghost columns.
            real_t *row = u + (idx + 1) * pitch;
            row[0] = row[2];
            row[N + 1] = row[N - 1];
        }

        if (idx < N) {
            // Column idx of the interior: top and bottom ghost rows.
            const int_t col = idx + 1;
            u[col] = u[2 * pitch + col];
            u[(M + 1) * pitch + col] = u[(M - 1) * pitch + col];
        }
    }
}
|
||||
// END: T6
|
||||
@@ -133,12 +142,25 @@ void boundary_condition(void) {
|
||||
// TASK: T5
|
||||
// Integration formula
|
||||
// BEGIN: T5
|
||||
// Compute one explicit time step of the 2D wave equation on the device:
//   u_nxt = 2*u - u_prv + (c*dt)^2 * (d2u/dx2 + d2u/dy2)
// using central second differences on the four direct neighbours.
//
// All three arrays are row-major with N + 2 values per row and M + 2 rows;
// interior point (i, j) is stored at (i + 1) * (N + 2) + (j + 1). The ghost
// cells of `u` are read for points on the edge, so they must already hold
// the boundary values.
//
// Launch layout: 2D grid of 2D blocks, x covering the N columns and y the
// M rows; surplus threads exit immediately.
// Precondition: u_prv, u and u_nxt must not overlap (__restrict__).
__global__ void time_step(const real_t *__restrict__ u_prv,
                          const real_t *__restrict__ u,
                          real_t *__restrict__ u_nxt,
                          int_t M, int_t N, real_t c, real_t dt,
                          real_t dx, real_t dy) {
    const int_t i = (int_t)blockIdx.y * blockDim.y + threadIdx.y;
    const int_t j = (int_t)blockIdx.x * blockDim.x + threadIdx.x;

    if (i >= M || j >= N)
        return;

    const int_t pitch = N + 2;
    const int_t idx = (i + 1) * pitch + (j + 1);

    // Load the centre value once; it is used in both differences and the update.
    const real_t center = u[idx];

    // Neighbours along a row are +-1 apart, neighbours along a column +-pitch.
    const real_t d2udx2 = (u[idx + 1] - 2.0 * center + u[idx - 1]) / (dx * dx);
    const real_t d2udy2 = (u[idx + pitch] - 2.0 * center + u[idx - pitch]) / (dy * dy);

    u_nxt[idx] = 2.0 * center - u_prv[idx] + (c * dt) * (c * dt) * (d2udx2 + d2udy2);
}
|
||||
// END: T5
|
||||
|
||||
@@ -147,18 +169,21 @@ void time_step(void) {
|
||||
// Abort with a readable message when a CUDA call or kernel launch failed.
static void simulate_check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "simulate: %s failed: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Run the whole simulation: for every iteration from 0 to max_iteration
// (inclusive) optionally save a snapshot, apply the boundary condition to the
// current step, compute the next step on the device and rotate the buffers.
void simulate(void) {
    // BEGIN: T7
    // Launch configuration is loop-invariant, so compute it once.
    const dim3 block(BLOCKX, BLOCKY);
    const dim3 grid((unsigned int)((N + BLOCKX - 1) / BLOCKX),
                    (unsigned int)((M + BLOCKY - 1) / BLOCKY));

    const int boundary_block = 256;
    const int_t boundary_threads = M > N ? M : N;
    const unsigned int boundary_grid =
        (unsigned int)((boundary_threads + boundary_block - 1) / boundary_block);

    // Go through each time step
    for (int_t iteration = 0; iteration <= max_iteration; iteration++) {
        if ((iteration % snapshot_freq) == 0) {
            // Make sure every queued kernel has finished (and surface any
            // asynchronous kernel error) before the data is saved.
            simulate_check_cuda(cudaDeviceSynchronize(), "kernel execution");
            domain_save(iteration / snapshot_freq);
        }

        // Derive step t+1 from steps t and t-1.
        // The ghost cells of the current step are refreshed first, so the
        // stencil never reads unset boundary values (including on the very
        // first iteration). Both launches go to the same stream and
        // therefore run in this order.
        boundary_condition<<<boundary_grid, boundary_block>>>(buffers[1], M, N);
        simulate_check_cuda(cudaGetLastError(), "boundary_condition launch");

        time_step<<<grid, block>>>(buffers[0], buffers[1], buffers[2],
                                   M, N, c, dt, dx, dy);
        simulate_check_cuda(cudaGetLastError(), "time_step launch");

        // Rotate the time step buffers
        move_buffer_window();
    }

    // Wait for the last step so errors are reported before returning.
    simulate_check_cuda(cudaDeviceSynchronize(), "kernel execution");
    // END: T7
}
|
||||
|
||||
@@ -166,7 +191,29 @@ void simulate(void) {
|
||||
// GPU occupancy
|
||||
// Print the theoretical occupancy of the time_step kernel for a
// BLOCKX x BLOCKY thread block on the current device.
// The number of resident blocks per SM comes from the CUDA runtime, which
// accounts for the kernel's register and shared-memory usage as well as the
// hardware block limit, instead of dividing thread counts only.
// On a CUDA error a message is written to stderr and nothing else is printed.
void occupancy(void) {
    // BEGIN: T8
    int device = 0;
    cudaDeviceProp prop;

    cudaError_t err = cudaGetDevice(&device);
    if (err == cudaSuccess)
        err = cudaGetDeviceProperties(&prop, device);
    if (err != cudaSuccess) {
        fprintf(stderr, "occupancy: cannot query device: %s\n",
                cudaGetErrorString(err));
        return;
    }

    const int threads_per_block = BLOCKX * BLOCKY;

    // Use the device's own warp size rather than assuming 32.
    const int warps_per_block =
        (threads_per_block + prop.warpSize - 1) / prop.warpSize;
    const int max_warps_per_sm =
        prop.maxThreadsPerMultiProcessor / prop.warpSize;

    // Blocks of time_step that can be resident on one SM at this block size
    // (no dynamic shared memory is used by the launch).
    int max_blocks_per_sm = 0;
    err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_sm, time_step, threads_per_block, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "occupancy: cannot query occupancy: %s\n",
                cudaGetErrorString(err));
        return;
    }

    const int active_warps = max_blocks_per_sm * warps_per_block;

    real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm;

    if (occupancy_ratio > 1.0)
        occupancy_ratio = 1.0;

    printf("GPU Occupancy: %.2f%%\n", occupancy_ratio * 100.0);
    printf("Active warps per SM: %d\n", active_warps);
    printf("Maximum warps per SM: %d\n", max_warps_per_sm);
    printf("Threads per block: %d\n", threads_per_block);
    printf("Max blocks per SM: %d\n", max_blocks_per_sm);
    // END: T8
}
|
||||
|
||||
@@ -211,9 +258,13 @@ void domain_initialize(void) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
buffers[0] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
|
||||
buffers[1] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
|
||||
buffers[2] = (real_t *)malloc((M + 2) * (N + 2) * sizeof(real_t));
|
||||
size_t size = (M + 2) * (N + 2) * sizeof(real_t);
|
||||
|
||||
cudaMalloc(&buffers[0], size);
|
||||
cudaMalloc(&buffers[1], size);
|
||||
cudaMalloc(&buffers[2], size);
|
||||
|
||||
h_buffer = (real_t *)malloc(size);
|
||||
|
||||
for (int_t i = 0; i < M; i++) {
|
||||
for (int_t j = 0; j < N; j++) {
|
||||
@@ -224,6 +275,10 @@ void domain_initialize(void) {
|
||||
}
|
||||
}
|
||||
|
||||
cudaMemcpy(buffers[0], h_buffer, size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(buffers[1], h_buffer, size, cudaMemcpyHostToDevice);
|
||||
cudaMemset(buffers[2], 0, size);
|
||||
|
||||
// Set the time step for 2D case
|
||||
dt = dx * dy / (c * sqrt(dx * dx + dy * dy));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user