#include #include #include #include #include #include #include #include #include #include #include #include namespace cg = cooperative_groups; #define WALLTIME(t) ((double)(t).tv_sec + 1e-6 * (double)(t).tv_usec) typedef int64_t int_t; typedef double real_t; int_t N = 128, M = 128, max_iteration = 1000000, snapshot_freq = 1000; #define BLOCKX 8 #define BLOCKY 8 #define IDX2D(i, j, stride) (((i) + 1) * (stride) + (j) + 1) #define HOST_U(buffer, i, j) buffer[IDX2D(i, j, N + 2)] #define DEVICE_IDX(i, j, stride) (((i) + 1) * (stride) + (j) + 1) const real_t c = 1.0, dx = 1.0, dy = 1.0; real_t dt; real_t *d_buffers[3] = { NULL, NULL, NULL }; real_t *h_buffer = NULL; #define cudaErrorCheck(ans) \ { \ gpuAssert((ans), __FILE__, __LINE__); \ } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } void move_buffer_window(void) { real_t *temp = d_buffers[0]; d_buffers[0] = d_buffers[1]; d_buffers[1] = d_buffers[2]; d_buffers[2] = temp; } void domain_save(int_t step) { char filename[256]; if (mkdir("data", 0755) != 0 && errno != EEXIST) { perror("mkdir data"); exit(EXIT_FAILURE); } snprintf(filename, sizeof(filename), "data/%05" PRId64 ".dat", step); FILE *out = fopen(filename, "wb"); if (out == NULL) { perror("fopen output file"); fprintf(stderr, "Failed to open '%s' for writing.\n", filename); exit(EXIT_FAILURE); } for (int_t i = 0; i < M; ++i) { size_t written = fwrite(&HOST_U(h_buffer, i, 0), sizeof(real_t), (size_t)N, out); if (written != (size_t)N) { perror("fwrite"); fclose(out); exit(EXIT_FAILURE); } } if (fclose(out) != 0) { perror("fclose"); exit(EXIT_FAILURE); } } void domain_finalize(void) { cudaFree(d_buffers[0]); cudaFree(d_buffers[1]); cudaFree(d_buffers[2]); cudaFreeHost(h_buffer); } __global__ void wave_equation_step(real_t *__restrict__ d_u_prv, real_t *__restrict__ d_u, real_t *__restrict__ d_u_nxt, int_t M, int_t N, real_t coeff) { cg::grid_group grid = cg::this_grid(); int_t i = blockIdx.y * blockDim.y + threadIdx.y; int_t j = blockIdx.x * blockDim.x + threadIdx.x; int_t stride = N + 2; if (i < M && j < N) { int_t idx = DEVICE_IDX(i, j, stride); real_t u_center = d_u[idx]; real_t u_up = d_u[idx + stride]; real_t u_down = d_u[idx - stride]; real_t u_right = d_u[idx + 1]; real_t u_left = d_u[idx - 1]; real_t laplacian = u_right + u_left + u_up + u_down - 4.0 * u_center; d_u_nxt[idx] = 2.0 * u_center - d_u_prv[idx] + coeff * laplacian; } grid.sync(); int_t global_thread_idx = (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y) + threadIdx.y * blockDim.x + threadIdx.x; if (global_thread_idx < M) { int_t row = global_thread_idx; int_t row_offset = (row + 1) * stride; d_u_nxt[row_offset] = d_u_nxt[row_offset + 2]; d_u_nxt[row_offset + N + 1] = d_u_nxt[row_offset + N - 1]; } if (global_thread_idx < N) { int_t col = global_thread_idx; d_u_nxt[col + 1] = d_u_nxt[2 * stride + col + 1]; d_u_nxt[(M + 1) * stride + col + 1] = d_u_nxt[(M - 1) * stride + col + 1]; } } void simulate(void) { dim3 blockDim(BLOCKX, BLOCKY); dim3 gridDim((N + BLOCKX - 1) / BLOCKX, (M + BLOCKY - 1) / BLOCKY); real_t coeff = (c * dt) * (c * dt) / (dx * dx); size_t size = (M + 2) * (N + 2) * sizeof(real_t); cudaStream_t stream; cudaStreamCreate(&stream); cudaMemcpyAsync(h_buffer, d_buffers[1], size, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); domain_save(0); void *kernelArgs[] = { (void *)&d_buffers[0], (void *)&d_buffers[1], (void *)&d_buffers[2], (void *)&M, (void *)&N, (void *)&coeff }; for (int_t iteration = 1; iteration <= max_iteration; iteration++) { cudaLaunchCooperativeKernel( (void *)wave_equation_step, gridDim, blockDim, kernelArgs, 0, stream); move_buffer_window(); if (iteration % snapshot_freq == 0) { cudaStreamSynchronize(stream); cudaMemcpyAsync(h_buffer, d_buffers[1], size, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); domain_save(iteration / snapshot_freq); } } cudaStreamSynchronize(stream); cudaStreamDestroy(stream); } void occupancy(void) { cudaDeviceProp p; cudaGetDeviceProperties(&p, 0); dim3 blockDim(BLOCKX, BLOCKY); int threads_per_block = blockDim.x * blockDim.y; dim3 gridDim((N + BLOCKX - 1) / BLOCKX, (M + BLOCKY - 1) / BLOCKY); printf("Grid size set to: (%d, %d)\n", gridDim.x, gridDim.y); printf("Launched blocks of size: (%d, %d)\n", BLOCKX, BLOCKY); int numBlocksPerSm = 0; cudaOccupancyMaxActiveBlocksPerMultiprocessor( &numBlocksPerSm, wave_equation_step, threads_per_block, 0); int activeWarps = numBlocksPerSm * ((threads_per_block + 31) / 32); int maxWarps = p.maxThreadsPerMultiProcessor / 32; real_t occupancy_ratio = (real_t)activeWarps / (real_t)maxWarps; printf("Theoretical occupancy: %.6f\n", occupancy_ratio); } static bool init_cuda() { int count; if (cudaGetDeviceCount(&count) != cudaSuccess) return false; printf("CUDA device count: %d\n", count); if (count > 0) { cudaDeviceProp p; if (cudaSetDevice(0) != cudaSuccess) return false; if (cudaGetDeviceProperties(&p, 0) != cudaSuccess) return false; if (!p.cooperativeLaunch) { fprintf(stderr, "Device does not support cooperative kernel launch!\n"); return false; } printf("CUDA device #0:\n"); printf(" Name: %s\n", p.name); printf(" Compute capability: %d.%d\n", p.major, p.minor); printf(" Multiprocessors: %d\n", p.multiProcessorCount); printf(" Warp size: %d\n", p.warpSize); printf(" Global memory: %.1fGiB bytes\n", p.totalGlobalMem / (1024.0 * 1024.0 * 1024.0)); printf(" Per-block shared memory: %.1fKiB\n", p.sharedMemPerBlock / 1024.0); printf(" Per-block registers: %d\n", p.regsPerBlock); printf(" Cooperative launch: %s\n", p.cooperativeLaunch ? "YES" : "NO"); } return true; } void domain_initialize(void) { bool locate_cuda = init_cuda(); if (!locate_cuda) exit(EXIT_FAILURE); size_t size = (M + 2) * (N + 2) * sizeof(real_t); cudaMalloc(&d_buffers[0], size); cudaMalloc(&d_buffers[1], size); cudaMalloc(&d_buffers[2], size); cudaHostAlloc(&h_buffer, size, cudaHostAllocDefault); for (int_t i = 0; i < M; i++) { for (int_t j = 0; j < N; j++) { real_t delta = sqrt(((i - M / 2.0) * (i - M / 2.0)) / (real_t)M + ((j - N / 2.0) * (j - N / 2.0)) / (real_t)N); HOST_U(h_buffer, i, j) = exp(-4.0 * delta * delta); } } cudaMemcpy(d_buffers[0], h_buffer, size, cudaMemcpyHostToDevice); cudaMemcpy(d_buffers[1], h_buffer, size, cudaMemcpyHostToDevice); cudaMemset(d_buffers[2], 0, size); dt = dx * dy / (c * sqrt(dx * dx + dy * dy)); } int main(void) { domain_initialize(); struct timeval t_start, t_end; gettimeofday(&t_start, NULL); simulate(); gettimeofday(&t_end, NULL); printf("Total elapsed time: %lf seconds\n", WALLTIME(t_end) - WALLTIME(t_start)); occupancy(); domain_finalize(); exit(EXIT_SUCCESS); }