// Files
// TDT4200/exercise7/wave_2d_parallel.cu
//
// 261 lines
// 7.6 KiB
// Plaintext

#include <cooperative_groups.h>
#include <cuda_runtime_api.h>
#include <driver_types.h>
#include <errno.h>
#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
// Short alias for the cooperative groups namespace used by the kernel.
namespace cg = cooperative_groups;
// Converts a struct timeval into seconds as a double (tv_sec plus tv_usec scaled by 1e-6).
#define WALLTIME(t) ((double)(t).tv_sec + 1e-6 * (double)(t).tv_usec)
// Integer and floating-point types used throughout this file.
typedef int64_t int_t;
typedef double real_t;
// N: interior cells per row (fast index j); M: number of interior rows (index i).
// Buffers are sized (M + 2) * (N + 2), i.e. the interior plus a one-cell border.
// max_iteration: number of time steps run by simulate().
// snapshot_freq: a snapshot is written every snapshot_freq iterations.
int_t N = 128, M = 128, max_iteration = 1000000, snapshot_freq = 1000;
// Thread-block dimensions used when launching the kernel.
#define BLOCKX 16
#define BLOCKY 16
// c scales the time step in the update formula (presumably the wave speed — confirm);
// dx and dy divide the second differences in the x and y directions.
const real_t c = 1.0, dx = 1.0, dy = 1.0;
// Time step; assigned in domain_initialize() from c, dx and dy.
real_t dt;
// Three device buffers passed to the kernel as (u_prv, u, u_nxt) and rotated
// by move_buffer_window() after every step.
real_t *buffers[3] = { NULL, NULL, NULL };
// Host-side buffer used for initialization and for copying snapshots back.
real_t *h_buffer = NULL;
// Host accessors for interior cell (i, j); the +1 offsets skip the border.
// NOTE: all three macros expand to the same h_buffer element — there is only
// one host buffer, so U_prv, U and U_nxt are aliases of each other.
#define U_prv(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]
#define U(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]
#define U_nxt(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]
// Checks the result of a CUDA runtime call and reports a failure together with
// the file and line of the call site.
// The do { } while (0) wrapper makes the macro expand to exactly one statement,
// so `if (x) cudaErrorCheck(...); else ...` parses as intended.
#define cudaErrorCheck(ans)                   \
    do {                                      \
        gpuAssert((ans), __FILE__, __LINE__); \
    } while (0)

// Reports a CUDA error on stderr and, by default, terminates the process.
//
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (the default) the process exits with `code` as its
//           status; when false the error is only reported
//
// Does nothing when `code` is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code == cudaSuccess)
        return;
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Rotates the three buffer pointers by one position: slot 0 takes the old
// slot 1, slot 1 takes the old slot 2, and the old slot 0 is recycled into
// slot 2. Only the pointers move; no data is copied.
void move_buffer_window(void) {
    real_t *const recycled = buffers[0];
    for (int slot = 0; slot < 2; ++slot)
        buffers[slot] = buffers[slot + 1];
    buffers[2] = recycled;
}
// Writes the interior of the host buffer to "data/<step>.dat" as raw real_t
// values, one row of N values at a time for M rows. The "data" directory is
// created if it does not already exist.
//
//   step - number used to build the zero-padded output file name
//
// Any failure (mkdir, fopen, fwrite, fclose) is reported with perror and
// terminates the process with EXIT_FAILURE.
void domain_save(int_t step) {
    // An already existing directory is not an error.
    const int mkdir_rc = mkdir("data", 0755);
    if (mkdir_rc != 0 && errno != EEXIST) {
        perror("mkdir data");
        exit(EXIT_FAILURE);
    }

    char path[256];
    snprintf(path, sizeof(path), "data/%05" PRId64 ".dat", step);

    FILE *fp = fopen(path, "wb");
    if (!fp) {
        perror("fopen output file");
        fprintf(stderr, "Failed to open '%s' for writing.\n", path);
        exit(EXIT_FAILURE);
    }

    // Rows are written separately so the border cells on either side of each
    // row are skipped.
    int_t row = 0;
    while (row < M) {
        const size_t count = fwrite(&U(row, 0), sizeof(real_t), (size_t)N, fp);
        if (count != (size_t)N) {
            perror("fwrite");
            fclose(fp);
            exit(EXIT_FAILURE);
        }
        ++row;
    }

    if (fclose(fp) != 0) {
        perror("fclose");
        exit(EXIT_FAILURE);
    }
}
// Releases the three device buffers and the host buffer.
//
// A failing cudaFree is reported on stderr but does not abort, so the
// remaining resources are still released. All pointers are reset to NULL
// afterwards, which makes a repeated call harmless.
void domain_finalize(void) {
    for (int k = 0; k < 3; ++k) {
        // abort = false: report the error and keep cleaning up.
        gpuAssert(cudaFree(buffers[k]), __FILE__, __LINE__, false);
        buffers[k] = NULL;
    }
    free(h_buffer);
    h_buffer = NULL;
}
// Combined kernel: performs one time step on the interior and then applies the
// mirrored boundary condition to the border of the new time level.
//
// Launch requirements:
//   - must be launched with cudaLaunchCooperativeKernel, because it uses a
//     grid-wide barrier (grid.sync());
//   - a 2D grid of 2D blocks whose threads cover at least M rows (y) by
//     N columns (x); surplus threads skip the stencil update.
//
// Parameters:
//   u_prv, u, u_nxt - device buffers holding (M + 2) * (N + 2) values each,
//                     row-major with a one-cell border around the M x N
//                     interior; u_prv and u are read, u_nxt is written
//   M, N            - number of interior rows / columns
//   c, dt, dx, dy   - coefficient, time step and grid spacings of the update
__global__ void wave_equation_step(real_t *u_prv, real_t *u, real_t *u_nxt,
                                   int_t M, int_t N, real_t c, real_t dt,
                                   real_t dx, real_t dy) {
    cg::grid_group grid = cg::this_grid();

    // Widen before multiplying so the index products are done in 64 bits.
    const int_t i = (int_t)blockIdx.y * blockDim.y + threadIdx.y;
    const int_t j = (int_t)blockIdx.x * blockDim.x + threadIdx.x;
    const int_t pitch = N + 2;  // row length including the two border cells

    // time step
    if (i < M && j < N) {
        const int_t idx = (i + 1) * pitch + (j + 1);
        const int_t idx_up = (i + 2) * pitch + (j + 1);
        const int_t idx_down = (i)*pitch + (j + 1);
        const int_t idx_right = (i + 1) * pitch + (j + 2);
        const int_t idx_left = (i + 1) * pitch + (j);
        const real_t d2udx2 = (u[idx_right] - 2.0 * u[idx] + u[idx_left]) / (dx * dx);
        const real_t d2udy2 = (u[idx_up] - 2.0 * u[idx] + u[idx_down]) / (dy * dy);
        u_nxt[idx] = 2.0 * u[idx] - u_prv[idx] + (c * dt) * (c * dt) * (d2udx2 + d2udy2);
    }

    // Every interior value of u_nxt must be written before any border cell is
    // derived from it.
    grid.sync();

    // boundary condition
    // Each thread has a unique rank in the whole grid and strides by the grid
    // size, so every row and column is handled exactly once whatever the
    // launch shape or the M/N ratio.
    const int_t rank = (int_t)grid.thread_rank();
    const int_t stride = (int_t)grid.size();

    // Left and right border columns mirror the interior cell one step inside
    // the edge.
    for (int_t row = rank; row < M; row += stride) {
        const int_t base = (row + 1) * pitch;
        u_nxt[base + 0] = u_nxt[base + 2];
        u_nxt[base + (N + 1)] = u_nxt[base + (N - 1)];
    }

    // Bottom and top border rows, likewise.
    for (int_t col = rank; col < N; col += stride) {
        u_nxt[0 * pitch + (col + 1)] = u_nxt[2 * pitch + (col + 1)];
        u_nxt[(M + 1) * pitch + (col + 1)] = u_nxt[(M - 1) * pitch + (col + 1)];
    }
}
// Runs the simulation for max_iteration time steps.
//
// The initial state (buffers[1]) is saved as snapshot 0. Every iteration
// launches the cooperative kernel once and rotates the buffer window; every
// snapshot_freq iterations the newest state is copied to the host and saved.
//
// Any CUDA failure terminates the process through cudaErrorCheck.
void simulate(void) {
    const dim3 block(BLOCKX, BLOCKY);
    const dim3 grid((N + block.x - 1) / block.x,
                    (M + block.y - 1) / block.y);
    const size_t size = (size_t)(M + 2) * (size_t)(N + 2) * sizeof(real_t);

    cudaErrorCheck(cudaMemcpy(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost));
    domain_save(0);

    // The argument array stores the ADDRESSES of the buffers[] slots, so each
    // launch picks up whatever pointers the slots hold after the latest
    // rotation. Argument values are read when the launch call is made.
    void *kernelArgs[] = {
        (void *)&buffers[0], (void *)&buffers[1], (void *)&buffers[2],
        (void *)&M, (void *)&N, (void *)&c, (void *)&dt, (void *)&dx, (void *)&dy
    };

    for (int_t iteration = 1; iteration <= max_iteration; iteration++) {
        // The return value reports launch failures (e.g. a grid too large for
        // a cooperative launch).
        cudaErrorCheck(cudaLaunchCooperativeKernel(
            (void *)wave_equation_step,
            grid, block,
            kernelArgs));

        // No per-iteration device synchronization: launches on the same
        // stream execute in order, and the blocking cudaMemcpy below waits
        // for all preceding kernels before a snapshot is read.
        move_buffer_window();

        if (iteration % snapshot_freq == 0) {
            cudaErrorCheck(cudaMemcpy(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost));
            domain_save(iteration / snapshot_freq);
        }
    }

    // Wait for outstanding kernels so the caller's timing covers the whole
    // run and any execution error is reported here.
    cudaErrorCheck(cudaDeviceSynchronize());
}
// Prints the launch configuration used by simulate() and a simple estimate of
// the theoretical occupancy on device 0.
//
// The estimate only considers the per-multiprocessor thread limit: it is the
// number of warps in as many blocks as fit under that limit, divided by the
// maximum number of warps per multiprocessor, capped at 1.0.
void occupancy(void) {
    cudaDeviceProp p;
    cudaErrorCheck(cudaGetDeviceProperties(&p, 0));

    const dim3 block(BLOCKX, BLOCKY);
    const int threads_per_block = (int)(block.x * block.y);

    // The Y extent follows M (rows), matching the grid built in simulate().
    const dim3 grid((N + BLOCKX - 1) / BLOCKX, (M + BLOCKY - 1) / BLOCKY);

    // dim3 members are unsigned, hence %u.
    printf("Grid size set to: (%u, %u)\n", grid.x, grid.y);
    printf("Launched blocks of size: (%d, %d)\n", BLOCKX, BLOCKY);

    const int warps_per_block = (threads_per_block + 31) / 32;
    const int max_warps_per_sm = p.maxThreadsPerMultiProcessor / 32;
    const int max_blocks_per_sm = p.maxThreadsPerMultiProcessor / threads_per_block;
    const int active_warps = max_blocks_per_sm * warps_per_block;

    real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm;
    if (occupancy_ratio > 1.0)
        occupancy_ratio = 1.0;
    printf("Theoretical occupancy: %.6f\n", occupancy_ratio);
}
// Selects CUDA device 0, verifies that it supports cooperative kernel
// launches, and prints a short summary of its properties.
//
// Returns true when device 0 is usable; false when no device is present, a
// runtime query fails, or cooperative launch is unsupported.
static bool init_cuda() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess)
        return false;
    printf("CUDA device count: %d\n", count);

    // Without a device nothing else in this program can work, so report
    // failure instead of letting the caller continue.
    if (count <= 0) {
        fprintf(stderr, "No CUDA device found!\n");
        return false;
    }

    cudaDeviceProp p;
    if (cudaSetDevice(0) != cudaSuccess)
        return false;
    if (cudaGetDeviceProperties(&p, 0) != cudaSuccess)
        return false;

    // Check cooperative launch support
    if (!p.cooperativeLaunch) {
        fprintf(stderr, "Device does not support cooperative kernel launch!\n");
        return false;
    }

    printf("CUDA device #0:\n");
    printf(" Name: %s\n", p.name);
    printf(" Compute capability: %d.%d\n", p.major, p.minor);
    printf(" Multiprocessors: %d\n", p.multiProcessorCount);
    printf(" Warp size: %d\n", p.warpSize);
    printf(" Global memory: %.1fGiB bytes\n", p.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    printf(" Per-block shared memory: %.1fKiB\n", p.sharedMemPerBlock / 1024.0);
    printf(" Per-block registers: %d\n", p.regsPerBlock);
    printf(" Cooperative launch: %s\n", p.cooperativeLaunch ? "YES" : "NO");
    return true;
}
// Initializes CUDA, allocates the device and host buffers, fills in the
// initial condition, uploads it, and computes the time step dt.
//
// Both the previous and the current time level (buffers[0] and buffers[1])
// start from the same field; buffers[2] is zeroed. The process exits with a
// diagnostic if CUDA is unavailable or any allocation or transfer fails.
void domain_initialize(void) {
    if (!init_cuda())
        exit(EXIT_FAILURE);

    const size_t cells = (size_t)(M + 2) * (size_t)(N + 2);
    const size_t size = cells * sizeof(real_t);

    cudaErrorCheck(cudaMalloc(&buffers[0], size));
    cudaErrorCheck(cudaMalloc(&buffers[1], size));
    cudaErrorCheck(cudaMalloc(&buffers[2], size));

    // Zero-initialized so that border and corner cells hold defined values
    // before they are uploaded.
    h_buffer = (real_t *)calloc(cells, sizeof(real_t));
    if (h_buffer == NULL) {
        perror("calloc host buffer");
        exit(EXIT_FAILURE);
    }

    // Initial condition on the interior. U_prv and U address the same host
    // buffer, so a single store covers both.
    for (int_t i = 0; i < M; i++) {
        for (int_t j = 0; j < N; j++) {
            real_t delta = sqrt(((i - M / 2.0) * (i - M / 2.0)) / (real_t)M +
                                ((j - N / 2.0) * (j - N / 2.0)) / (real_t)N);
            U(i, j) = exp(-4.0 * delta * delta);
        }
    }

    // The kernel only refreshes the border of the time level it has just
    // computed, so the very first step reads the border of the initial field.
    // Mirror it here the same way the kernel does.
    for (int_t i = 0; i < M; i++) {
        U(i, -1) = U(i, 1);
        U(i, N) = U(i, N - 2);
    }
    for (int_t j = 0; j < N; j++) {
        U(-1, j) = U(1, j);
        U(M, j) = U(M - 2, j);
    }

    cudaErrorCheck(cudaMemcpy(buffers[0], h_buffer, size, cudaMemcpyHostToDevice));
    cudaErrorCheck(cudaMemcpy(buffers[1], h_buffer, size, cudaMemcpyHostToDevice));
    cudaErrorCheck(cudaMemset(buffers[2], 0, size));

    dt = dx * dy / (c * sqrt(dx * dx + dy * dy));
}
// Program entry point: sets up the domain, runs and times the simulation,
// prints the elapsed wall-clock time and the occupancy report, then releases
// all resources and exits successfully.
int main(void) {
    struct timeval began, finished;

    domain_initialize();

    // Only simulate() is timed; setup and teardown are excluded.
    gettimeofday(&began, NULL);
    simulate();
    gettimeofday(&finished, NULL);

    const double elapsed = WALLTIME(finished) - WALLTIME(began);
    printf("Total elapsed time: %lf seconds\n", elapsed);

    occupancy();
    domain_finalize();

    exit(EXIT_SUCCESS);
}