// (removed pasted file-viewer metadata: "270 lines / 7.6 KiB / Plaintext")
#include <cooperative_groups.h>

#include <cuda_runtime_api.h>
#include <driver_types.h>

#include <errno.h>
#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>

namespace cg = cooperative_groups;

// Convert a struct timeval into seconds as a double.
#define WALLTIME(t) ((double)(t).tv_sec + 1e-6 * (double)(t).tv_usec)

typedef int64_t int_t;
typedef double real_t;

// Interior domain size (M rows x N columns), total time steps, and how many
// steps pass between snapshots written to disk.
int_t N = 128, M = 128, max_iteration = 1000000, snapshot_freq = 1000;

// Thread-block dimensions for the 2D kernel launch.
#define BLOCKX 8
#define BLOCKY 8

// Index into a buffer with a one-cell ghost border: (i, j) are interior
// coordinates, so (i+1, j+1) skips the border; `stride` is the padded row
// pitch (N + 2 for these buffers).
#define IDX2D(i, j, stride) (((i) + 1) * (stride) + (j) + 1)
#define HOST_U(buffer, i, j) buffer[IDX2D(i, j, N + 2)]
#define DEVICE_IDX(i, j, stride) (((i) + 1) * (stride) + (j) + 1)

// Wave speed and grid spacing; dt is derived in domain_initialize().
const real_t c = 1.0, dx = 1.0, dy = 1.0;
real_t dt;

// Rolling window of device buffers: [0] = previous, [1] = current, [2] = next.
real_t *d_buffers[3] = { NULL, NULL, NULL };
// Pinned host staging buffer used for initialization and snapshots.
real_t *h_buffer = NULL;
// Wrap a CUDA runtime call and abort with a diagnostic on failure.
// do { } while (0) makes the macro expand to exactly one statement, so
// `if (x) cudaErrorCheck(y); else ...` stays well-formed (the original bare
// { } block left a stray ';' that breaks such uses).
#define cudaErrorCheck(ans) \
    do { \
        gpuAssert((ans), __FILE__, __LINE__); \
    } while (0)

// Print the CUDA error string with source location; exits the process with
// the error code unless abort == false.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
            exit(code);
    }
}
// Advance the three-buffer window one time step:
// prev <- current, current <- next, next <- (old prev, recycled for writing).
void move_buffer_window(void) {
    real_t *recycled = d_buffers[0];
    for (int k = 0; k < 2; k++)
        d_buffers[k] = d_buffers[k + 1];
    d_buffers[2] = recycled;
}
// Write the M x N interior of the host buffer to data/<step>.dat as raw
// binary real_t values, one row per fwrite; ghost cells are skipped.
// Any filesystem failure aborts the program.
void domain_save(int_t step) {
    // Create the output directory; an already-existing one is fine.
    if (mkdir("data", 0755) != 0 && errno != EEXIST) {
        perror("mkdir data");
        exit(EXIT_FAILURE);
    }

    char filename[256];
    snprintf(filename, sizeof(filename), "data/%05" PRId64 ".dat", step);

    FILE *out = fopen(filename, "wb");
    if (!out) {
        perror("fopen output file");
        fprintf(stderr, "Failed to open '%s' for writing.\n", filename);
        exit(EXIT_FAILURE);
    }

    // Each interior row is N contiguous values starting at column 0.
    for (int_t row = 0; row < M; row++) {
        size_t count = fwrite(&HOST_U(h_buffer, row, 0), sizeof(real_t), (size_t)N, out);
        if (count != (size_t)N) {
            perror("fwrite");
            fclose(out);
            exit(EXIT_FAILURE);
        }
    }

    if (fclose(out) != 0) {
        perror("fclose");
        exit(EXIT_FAILURE);
    }
}
// Release all device buffers and the pinned host buffer, checking each call.
// Pointers are reset to NULL afterwards so a second call is a harmless no-op
// (cudaFree / cudaFreeHost accept NULL).
void domain_finalize(void) {
    for (int k = 0; k < 3; k++) {
        cudaErrorCheck(cudaFree(d_buffers[k]));
        d_buffers[k] = NULL;
    }
    cudaErrorCheck(cudaFreeHost(h_buffer));
    h_buffer = NULL;
}
// One cooperative time step of the 2D wave equation on an (M+2) x (N+2)
// buffer with a one-cell ghost border.
//
// Launch contract: a 2D grid of BLOCKX x BLOCKY blocks covering the N x M
// interior, started with cudaLaunchCooperativeKernel — grid.sync() below
// requires a cooperative launch.
//
// Phase 1: every interior thread applies the 5-point stencil to d_u and
// writes d_u_nxt. Phase 2 (after a grid-wide barrier): the first M / N
// threads of a flattened thread enumeration mirror interior values into the
// ghost cells of d_u_nxt, so the next step can read a valid border.
__global__ void wave_equation_step(real_t *__restrict__ d_u_prv,
                                   real_t *__restrict__ d_u,
                                   real_t *__restrict__ d_u_nxt,
                                   int_t M, int_t N, real_t coeff) {
    cg::grid_group grid = cg::this_grid();

    // Interior coordinates: i indexes rows (0..M-1), j columns (0..N-1).
    int_t i = blockIdx.y * blockDim.y + threadIdx.y;
    int_t j = blockIdx.x * blockDim.x + threadIdx.x;
    int_t stride = N + 2;  // padded row pitch (two ghost columns)

    if (i < M && j < N) {
        int_t idx = DEVICE_IDX(i, j, stride);

        real_t u_center = d_u[idx];
        real_t u_up = d_u[idx + stride];
        real_t u_down = d_u[idx - stride];
        real_t u_right = d_u[idx + 1];
        real_t u_left = d_u[idx - 1];

        // Leapfrog update: u_nxt = 2u - u_prv + coeff * laplacian(u).
        real_t laplacian = u_right + u_left + u_up + u_down - 4.0 * u_center;
        d_u_nxt[idx] = 2.0 * u_center - d_u_prv[idx] + coeff * laplacian;
    }

    // All interior writes to d_u_nxt (from every block) must be visible
    // before any thread reads interior d_u_nxt values to fill the border.
    grid.sync();

    // Flat index over all launched threads, used to hand out one boundary
    // row/column per thread. The grid covers the M x N interior, so the
    // total thread count is >= max(M, N) and every boundary cell is covered.
    int_t global_thread_idx = (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y) +
                              threadIdx.y * blockDim.x + threadIdx.x;

    // Left/right ghost columns of one row: copy the value one cell inside
    // the opposite side of the boundary cell (reflective boundary).
    if (global_thread_idx < M) {
        int_t row = global_thread_idx;
        int_t row_offset = (row + 1) * stride;
        d_u_nxt[row_offset] = d_u_nxt[row_offset + 2];
        d_u_nxt[row_offset + N + 1] = d_u_nxt[row_offset + N - 1];
    }

    // Top/bottom ghost rows of one column, mirrored the same way.
    if (global_thread_idx < N) {
        int_t col = global_thread_idx;
        d_u_nxt[col + 1] = d_u_nxt[2 * stride + col + 1];
        d_u_nxt[(M + 1) * stride + col + 1] = d_u_nxt[(M - 1) * stride + col + 1];
    }
}
// Run the time loop: one cooperative kernel launch per iteration, rotating
// the buffer window between launches, and copying the current state back to
// the host for a snapshot every snapshot_freq iterations.
// All CUDA calls are now routed through cudaErrorCheck — the original
// ignored every return code, so a failed cooperative launch (e.g. a grid
// too large for co-residency) went completely unnoticed.
void simulate(void) {
    dim3 blockDim(BLOCKX, BLOCKY);
    dim3 gridDim((N + BLOCKX - 1) / BLOCKX, (M + BLOCKY - 1) / BLOCKY);

    // Courant-number coefficient (c*dt/dx)^2; the stencil assumes dx == dy.
    real_t coeff = (c * dt) * (c * dt) / (dx * dx);

    size_t size = (M + 2) * (N + 2) * sizeof(real_t);

    cudaStream_t stream;
    cudaErrorCheck(cudaStreamCreate(&stream));

    // Snapshot of the initial condition (step 0).
    cudaErrorCheck(cudaMemcpyAsync(h_buffer, d_buffers[1], size, cudaMemcpyDeviceToHost, stream));
    cudaErrorCheck(cudaStreamSynchronize(stream));
    domain_save(0);

    // Argument values are read from these addresses at each launch, so
    // rotating d_buffers between launches feeds the next kernel the new
    // prev/current/next pointers.
    void *kernelArgs[] = {
        (void *)&d_buffers[0], (void *)&d_buffers[1], (void *)&d_buffers[2],
        (void *)&M, (void *)&N, (void *)&coeff
    };

    for (int_t iteration = 1; iteration <= max_iteration; iteration++) {
        cudaErrorCheck(cudaLaunchCooperativeKernel(
            (void *)wave_equation_step,
            gridDim, blockDim,
            kernelArgs, 0, stream));

        move_buffer_window();

        if (iteration % snapshot_freq == 0) {
            // The copy is stream-ordered after all queued kernels, so a
            // single sync after it is sufficient (the original's extra
            // pre-copy sync was redundant).
            cudaErrorCheck(cudaMemcpyAsync(h_buffer, d_buffers[1], size, cudaMemcpyDeviceToHost, stream));
            cudaErrorCheck(cudaStreamSynchronize(stream));
            domain_save(iteration / snapshot_freq);
        }
    }

    cudaErrorCheck(cudaStreamSynchronize(stream));
    cudaErrorCheck(cudaStreamDestroy(stream));
}
// Report the launch geometry and the kernel's theoretical SM occupancy
// (resident warps per SM / maximum warps per SM). The two runtime queries
// are now error-checked; the original ignored their return codes, so a
// failure would have printed garbage from uninitialized `p`.
void occupancy(void) {
    cudaDeviceProp p;
    cudaErrorCheck(cudaGetDeviceProperties(&p, 0));

    dim3 blockDim(BLOCKX, BLOCKY);
    int threads_per_block = blockDim.x * blockDim.y;
    dim3 gridDim((N + BLOCKX - 1) / BLOCKX, (M + BLOCKY - 1) / BLOCKY);

    printf("Grid size set to: (%d, %d)\n", gridDim.x, gridDim.y);
    printf("Launched blocks of size: (%d, %d)\n", BLOCKX, BLOCKY);

    int numBlocksPerSm = 0;
    cudaErrorCheck(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSm,
        wave_equation_step,
        threads_per_block,
        0));

    // Round threads up to whole warps of 32 lanes.
    int activeWarps = numBlocksPerSm * ((threads_per_block + 31) / 32);
    int maxWarps = p.maxThreadsPerMultiProcessor / 32;

    real_t occupancy_ratio = (real_t)activeWarps / (real_t)maxWarps;

    printf("Theoretical occupancy: %.6f\n", occupancy_ratio);
}
// Select device 0, verify it supports cooperative kernel launch, and print
// its capabilities. Returns false when no usable device is available.
static bool init_cuda() {
    int count;
    if (cudaGetDeviceCount(&count) != cudaSuccess)
        return false;
    printf("CUDA device count: %d\n", count);

    // BUG FIX: the original skipped the device block and returned true when
    // count == 0, letting initialization proceed with no GPU at all.
    if (count <= 0)
        return false;

    cudaDeviceProp p;
    if (cudaSetDevice(0) != cudaSuccess)
        return false;
    if (cudaGetDeviceProperties(&p, 0) != cudaSuccess)
        return false;

    // grid.sync() in wave_equation_step requires cooperative launch support.
    if (!p.cooperativeLaunch) {
        fprintf(stderr, "Device does not support cooperative kernel launch!\n");
        return false;
    }

    printf("CUDA device #0:\n");
    printf("  Name: %s\n", p.name);
    printf("  Compute capability: %d.%d\n", p.major, p.minor);
    printf("  Multiprocessors: %d\n", p.multiProcessorCount);
    printf("  Warp size: %d\n", p.warpSize);
    printf("  Global memory: %.1fGiB bytes\n", p.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    printf("  Per-block shared memory: %.1fKiB\n", p.sharedMemPerBlock / 1024.0);
    printf("  Per-block registers: %d\n", p.regsPerBlock);
    printf("  Cooperative launch: %s\n", p.cooperativeLaunch ? "YES" : "NO");

    return true;
}
// Allocate device and pinned host buffers, set the initial Gaussian pulse,
// upload it to the device, and derive the time step from the CFL condition.
void domain_initialize(void) {
    bool locate_cuda = init_cuda();
    if (!locate_cuda)
        exit(EXIT_FAILURE);

    size_t size = (M + 2) * (N + 2) * sizeof(real_t);

    cudaErrorCheck(cudaMalloc(&d_buffers[0], size));
    cudaErrorCheck(cudaMalloc(&d_buffers[1], size));
    cudaErrorCheck(cudaMalloc(&d_buffers[2], size));

    cudaErrorCheck(cudaHostAlloc(&h_buffer, size, cudaHostAllocDefault));

    // BUG FIX: cudaHostAlloc does not zero the allocation. The loop below
    // only fills the M x N interior, so without this memset the ghost border
    // uploaded to d_buffers[0]/[1] was uninitialized garbage — and the very
    // first kernel step reads those ghost cells before any boundary pass
    // has run.
    memset(h_buffer, 0, size);

    // Radially symmetric Gaussian pulse centered in the domain.
    for (int_t i = 0; i < M; i++) {
        for (int_t j = 0; j < N; j++) {
            real_t delta = sqrt(((i - M / 2.0) * (i - M / 2.0)) / (real_t)M +
                                ((j - N / 2.0) * (j - N / 2.0)) / (real_t)N);
            HOST_U(h_buffer, i, j) = exp(-4.0 * delta * delta);
        }
    }

    // u(t=0) and u(t=-dt) start identical (zero initial velocity).
    cudaErrorCheck(cudaMemcpy(d_buffers[0], h_buffer, size, cudaMemcpyHostToDevice));
    cudaErrorCheck(cudaMemcpy(d_buffers[1], h_buffer, size, cudaMemcpyHostToDevice));
    cudaErrorCheck(cudaMemset(d_buffers[2], 0, size));

    // Largest stable time step for the 2D wave equation (CFL condition).
    dt = dx * dy / (c * sqrt(dx * dx + dy * dy));
}
// Entry point: initialize the domain, run and time the simulation, report
// the kernel's theoretical occupancy, then release all resources.
int main(void) {
    domain_initialize();

    struct timeval t_start, t_end;
    gettimeofday(&t_start, NULL);
    simulate();
    gettimeofday(&t_end, NULL);

    double elapsed = WALLTIME(t_end) - WALLTIME(t_start);
    printf("Total elapsed time: %lf seconds\n", elapsed);
    occupancy();

    domain_finalize();
    exit(EXIT_SUCCESS);
}