From dc8242d7f49e12310553ec39760b0a4bd304d2bb Mon Sep 17 00:00:00 2001 From: fredrikr79 Date: Tue, 4 Nov 2025 15:53:50 +0100 Subject: [PATCH] ex7: refactor to use d_ prefixes etc --- exercise7/wave_2d_parallel.cu | 76 +++++++++++++++++------------------ 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/exercise7/wave_2d_parallel.cu b/exercise7/wave_2d_parallel.cu index dedcbbf..ce2fbc6 100644 --- a/exercise7/wave_2d_parallel.cu +++ b/exercise7/wave_2d_parallel.cu @@ -23,16 +23,16 @@ int_t N = 128, M = 128, max_iteration = 1000000, snapshot_freq = 1000; #define BLOCKX 8 #define BLOCKY 8 +#define IDX2D(i, j, stride) (((i) + 1) * (stride) + (j) + 1) +#define HOST_U(buffer, i, j) buffer[IDX2D(i, j, N + 2)] +#define DEVICE_IDX(i, j, stride) (((i) + 1) * (stride) + (j) + 1) + const real_t c = 1.0, dx = 1.0, dy = 1.0; real_t dt; -real_t *buffers[3] = { NULL, NULL, NULL }; +real_t *d_buffers[3] = { NULL, NULL, NULL }; real_t *h_buffer = NULL; -#define U_prv(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1] -#define U(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1] -#define U_nxt(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1] - #define cudaErrorCheck(ans) \ { \ gpuAssert((ans), __FILE__, __LINE__); \ @@ -46,10 +46,10 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = } void move_buffer_window(void) { - real_t *temp = buffers[0]; - buffers[0] = buffers[1]; - buffers[1] = buffers[2]; - buffers[2] = temp; + real_t *temp = d_buffers[0]; + d_buffers[0] = d_buffers[1]; + d_buffers[1] = d_buffers[2]; + d_buffers[2] = temp; } void domain_save(int_t step) { @@ -66,7 +66,7 @@ void domain_save(int_t step) { exit(EXIT_FAILURE); } for (int_t i = 0; i < M; ++i) { - size_t written = fwrite(&U(i, 0), sizeof(real_t), (size_t)N, out); + size_t written = fwrite(&HOST_U(h_buffer, i, 0), sizeof(real_t), (size_t)N, out); if (written != (size_t)N) { perror("fwrite"); fclose(out); @@ -80,15 +80,15 @@ void domain_save(int_t step) { } void domain_finalize(void) { - cudaFree(buffers[0]); - cudaFree(buffers[1]); - cudaFree(buffers[2]); + cudaFree(d_buffers[0]); + cudaFree(d_buffers[1]); + cudaFree(d_buffers[2]); cudaFreeHost(h_buffer); } -__global__ void wave_equation_step(real_t *__restrict__ u_prv, - real_t *__restrict__ u, - real_t *__restrict__ u_nxt, +__global__ void wave_equation_step(real_t *__restrict__ d_u_prv, + real_t *__restrict__ d_u, + real_t *__restrict__ d_u_nxt, int_t M, int_t N, real_t coeff) { cg::grid_group grid = cg::this_grid(); @@ -97,33 +97,31 @@ __global__ void wave_equation_step(real_t *__restrict__ u_prv, int_t stride = N + 2; if (i < M && j < N) { - int_t idx = (i + 1) * stride + (j + 1); + int_t idx = DEVICE_IDX(i, j, stride); - real_t u_center = u[idx]; - real_t u_up = u[idx + stride]; - real_t u_down = u[idx - stride]; - real_t u_right = u[idx + 1]; - real_t u_left = u[idx - 1]; + real_t u_center = d_u[idx]; + real_t u_up = d_u[idx + stride]; + real_t u_down = d_u[idx - stride]; + real_t u_right = d_u[idx + 1]; + real_t u_left = d_u[idx - 1]; real_t laplacian = u_right + u_left + u_up + u_down - 4.0 * u_center; - u_nxt[idx] = 2.0 * u_center - u_prv[idx] + coeff * laplacian; + d_u_nxt[idx] = 2.0 * u_center - d_u_prv[idx] + coeff * laplacian; } - // grid.sync(); - int_t linear_idx = blockIdx.y * gridDim.x * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; if (linear_idx < M) { int_t row_offset = (linear_idx + 1) * stride; - u_nxt[row_offset] = u_nxt[row_offset + 2]; - u_nxt[row_offset + N + 1] = u_nxt[row_offset + N - 1]; + d_u_nxt[row_offset] = d_u_nxt[row_offset + 2]; + d_u_nxt[row_offset + N + 1] = d_u_nxt[row_offset + N - 1]; } if (linear_idx < N) { - u_nxt[linear_idx + 1] = u_nxt[2 * stride + linear_idx + 1]; - u_nxt[(M + 1) * stride + linear_idx + 1] = u_nxt[(M - 1) * stride + linear_idx + 1]; + d_u_nxt[linear_idx + 1] = d_u_nxt[2 * stride + linear_idx + 1]; + d_u_nxt[(M + 1) * stride + linear_idx + 1] = d_u_nxt[(M - 1) * stride + linear_idx + 1]; } } @@ -138,12 +136,12 @@ void simulate(void) { cudaStream_t stream; cudaStreamCreate(&stream); - cudaMemcpyAsync(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(h_buffer, d_buffers[1], size, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); domain_save(0); void *kernelArgs[] = { - (void *)&buffers[0], (void *)&buffers[1], (void *)&buffers[2], + (void *)&d_buffers[0], (void *)&d_buffers[1], (void *)&d_buffers[2], (void *)&M, (void *)&N, (void *)&coeff }; @@ -157,7 +155,7 @@ void simulate(void) { if (iteration % snapshot_freq == 0) { cudaStreamSynchronize(stream); - cudaMemcpyAsync(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(h_buffer, d_buffers[1], size, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); domain_save(iteration / snapshot_freq); } @@ -228,9 +226,9 @@ void domain_initialize(void) { size_t size = (M + 2) * (N + 2) * sizeof(real_t); - cudaMalloc(&buffers[0], size); - cudaMalloc(&buffers[1], size); - cudaMalloc(&buffers[2], size); + cudaMalloc(&d_buffers[0], size); + cudaMalloc(&d_buffers[1], size); + cudaMalloc(&d_buffers[2], size); cudaHostAlloc(&h_buffer, size, cudaHostAllocDefault); @@ -238,13 +236,13 @@ void domain_initialize(void) { for (int_t j = 0; j < N; j++) { real_t delta = sqrt(((i - M / 2.0) * (i - M / 2.0)) / (real_t)M + ((j - N / 2.0) * (j - N / 2.0)) / (real_t)N); - U_prv(i, j) = U(i, j) = exp(-4.0 * delta * delta); + HOST_U(h_buffer, i, j) = exp(-4.0 * delta * delta); } } - cudaMemcpy(buffers[0], h_buffer, size, cudaMemcpyHostToDevice); - cudaMemcpy(buffers[1], h_buffer, size, cudaMemcpyHostToDevice); - cudaMemset(buffers[2], 0, size); + cudaMemcpy(d_buffers[0], h_buffer, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_buffers[1], h_buffer, size, cudaMemcpyHostToDevice); + cudaMemset(d_buffers[2], 0, size); dt = dx * dy / (c * sqrt(dx * dx + dy * dy)); }