ex7: refactor to use d_ prefixes etc
This commit is contained in:
@@ -23,16 +23,16 @@ int_t N = 128, M = 128, max_iteration = 1000000, snapshot_freq = 1000;
|
||||
#define BLOCKX 8
|
||||
#define BLOCKY 8
|
||||
|
||||
#define IDX2D(i, j, stride) (((i) + 1) * (stride) + (j) + 1)
|
||||
#define HOST_U(buffer, i, j) buffer[IDX2D(i, j, N + 2)]
|
||||
#define DEVICE_IDX(i, j, stride) (((i) + 1) * (stride) + (j) + 1)
|
||||
|
||||
const real_t c = 1.0, dx = 1.0, dy = 1.0;
|
||||
real_t dt;
|
||||
|
||||
real_t *buffers[3] = { NULL, NULL, NULL };
|
||||
real_t *d_buffers[3] = { NULL, NULL, NULL };
|
||||
real_t *h_buffer = NULL;
|
||||
|
||||
#define U_prv(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]
|
||||
#define U(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]
|
||||
#define U_nxt(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]
|
||||
|
||||
#define cudaErrorCheck(ans) \
|
||||
{ \
|
||||
gpuAssert((ans), __FILE__, __LINE__); \
|
||||
@@ -46,10 +46,10 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort =
|
||||
}
|
||||
|
||||
void move_buffer_window(void) {
|
||||
real_t *temp = buffers[0];
|
||||
buffers[0] = buffers[1];
|
||||
buffers[1] = buffers[2];
|
||||
buffers[2] = temp;
|
||||
real_t *temp = d_buffers[0];
|
||||
d_buffers[0] = d_buffers[1];
|
||||
d_buffers[1] = d_buffers[2];
|
||||
d_buffers[2] = temp;
|
||||
}
|
||||
|
||||
void domain_save(int_t step) {
|
||||
@@ -66,7 +66,7 @@ void domain_save(int_t step) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
for (int_t i = 0; i < M; ++i) {
|
||||
size_t written = fwrite(&U(i, 0), sizeof(real_t), (size_t)N, out);
|
||||
size_t written = fwrite(&HOST_U(h_buffer, i, 0), sizeof(real_t), (size_t)N, out);
|
||||
if (written != (size_t)N) {
|
||||
perror("fwrite");
|
||||
fclose(out);
|
||||
@@ -80,15 +80,15 @@ void domain_save(int_t step) {
|
||||
}
|
||||
|
||||
void domain_finalize(void) {
|
||||
cudaFree(buffers[0]);
|
||||
cudaFree(buffers[1]);
|
||||
cudaFree(buffers[2]);
|
||||
cudaFree(d_buffers[0]);
|
||||
cudaFree(d_buffers[1]);
|
||||
cudaFree(d_buffers[2]);
|
||||
cudaFreeHost(h_buffer);
|
||||
}
|
||||
|
||||
__global__ void wave_equation_step(real_t *__restrict__ u_prv,
|
||||
real_t *__restrict__ u,
|
||||
real_t *__restrict__ u_nxt,
|
||||
__global__ void wave_equation_step(real_t *__restrict__ d_u_prv,
|
||||
real_t *__restrict__ d_u,
|
||||
real_t *__restrict__ d_u_nxt,
|
||||
int_t M, int_t N, real_t coeff) {
|
||||
cg::grid_group grid = cg::this_grid();
|
||||
|
||||
@@ -97,33 +97,31 @@ __global__ void wave_equation_step(real_t *__restrict__ u_prv,
|
||||
int_t stride = N + 2;
|
||||
|
||||
if (i < M && j < N) {
|
||||
int_t idx = (i + 1) * stride + (j + 1);
|
||||
int_t idx = DEVICE_IDX(i, j, stride);
|
||||
|
||||
real_t u_center = u[idx];
|
||||
real_t u_up = u[idx + stride];
|
||||
real_t u_down = u[idx - stride];
|
||||
real_t u_right = u[idx + 1];
|
||||
real_t u_left = u[idx - 1];
|
||||
real_t u_center = d_u[idx];
|
||||
real_t u_up = d_u[idx + stride];
|
||||
real_t u_down = d_u[idx - stride];
|
||||
real_t u_right = d_u[idx + 1];
|
||||
real_t u_left = d_u[idx - 1];
|
||||
|
||||
real_t laplacian = u_right + u_left + u_up + u_down - 4.0 * u_center;
|
||||
u_nxt[idx] = 2.0 * u_center - u_prv[idx] + coeff * laplacian;
|
||||
d_u_nxt[idx] = 2.0 * u_center - d_u_prv[idx] + coeff * laplacian;
|
||||
}
|
||||
|
||||
// grid.sync();
|
||||
|
||||
int_t linear_idx = blockIdx.y * gridDim.x * blockDim.x * blockDim.y +
|
||||
blockIdx.x * blockDim.x * blockDim.y +
|
||||
threadIdx.y * blockDim.x + threadIdx.x;
|
||||
|
||||
if (linear_idx < M) {
|
||||
int_t row_offset = (linear_idx + 1) * stride;
|
||||
u_nxt[row_offset] = u_nxt[row_offset + 2];
|
||||
u_nxt[row_offset + N + 1] = u_nxt[row_offset + N - 1];
|
||||
d_u_nxt[row_offset] = d_u_nxt[row_offset + 2];
|
||||
d_u_nxt[row_offset + N + 1] = d_u_nxt[row_offset + N - 1];
|
||||
}
|
||||
|
||||
if (linear_idx < N) {
|
||||
u_nxt[linear_idx + 1] = u_nxt[2 * stride + linear_idx + 1];
|
||||
u_nxt[(M + 1) * stride + linear_idx + 1] = u_nxt[(M - 1) * stride + linear_idx + 1];
|
||||
d_u_nxt[linear_idx + 1] = d_u_nxt[2 * stride + linear_idx + 1];
|
||||
d_u_nxt[(M + 1) * stride + linear_idx + 1] = d_u_nxt[(M - 1) * stride + linear_idx + 1];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,12 +136,12 @@ void simulate(void) {
|
||||
cudaStream_t stream;
|
||||
cudaStreamCreate(&stream);
|
||||
|
||||
cudaMemcpyAsync(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost, stream);
|
||||
cudaMemcpyAsync(h_buffer, d_buffers[1], size, cudaMemcpyDeviceToHost, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
domain_save(0);
|
||||
|
||||
void *kernelArgs[] = {
|
||||
(void *)&buffers[0], (void *)&buffers[1], (void *)&buffers[2],
|
||||
(void *)&d_buffers[0], (void *)&d_buffers[1], (void *)&d_buffers[2],
|
||||
(void *)&M, (void *)&N, (void *)&coeff
|
||||
};
|
||||
|
||||
@@ -157,7 +155,7 @@ void simulate(void) {
|
||||
|
||||
if (iteration % snapshot_freq == 0) {
|
||||
cudaStreamSynchronize(stream);
|
||||
cudaMemcpyAsync(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost, stream);
|
||||
cudaMemcpyAsync(h_buffer, d_buffers[1], size, cudaMemcpyDeviceToHost, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
domain_save(iteration / snapshot_freq);
|
||||
}
|
||||
@@ -228,9 +226,9 @@ void domain_initialize(void) {
|
||||
|
||||
size_t size = (M + 2) * (N + 2) * sizeof(real_t);
|
||||
|
||||
cudaMalloc(&buffers[0], size);
|
||||
cudaMalloc(&buffers[1], size);
|
||||
cudaMalloc(&buffers[2], size);
|
||||
cudaMalloc(&d_buffers[0], size);
|
||||
cudaMalloc(&d_buffers[1], size);
|
||||
cudaMalloc(&d_buffers[2], size);
|
||||
|
||||
cudaHostAlloc(&h_buffer, size, cudaHostAllocDefault);
|
||||
|
||||
@@ -238,13 +236,13 @@ void domain_initialize(void) {
|
||||
for (int_t j = 0; j < N; j++) {
|
||||
real_t delta = sqrt(((i - M / 2.0) * (i - M / 2.0)) / (real_t)M +
|
||||
((j - N / 2.0) * (j - N / 2.0)) / (real_t)N);
|
||||
U_prv(i, j) = U(i, j) = exp(-4.0 * delta * delta);
|
||||
HOST_U(h_buffer, i, j) = exp(-4.0 * delta * delta);
|
||||
}
|
||||
}
|
||||
|
||||
cudaMemcpy(buffers[0], h_buffer, size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(buffers[1], h_buffer, size, cudaMemcpyHostToDevice);
|
||||
cudaMemset(buffers[2], 0, size);
|
||||
cudaMemcpy(d_buffers[0], h_buffer, size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_buffers[1], h_buffer, size, cudaMemcpyHostToDevice);
|
||||
cudaMemset(d_buffers[2], 0, size);
|
||||
|
||||
dt = dx * dy / (c * sqrt(dx * dx + dy * dy));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user