ex7: handle uneven sizes (27*28)

This commit is contained in:
2025-11-04 16:18:03 +01:00
parent dc8242d7f4
commit 46ffa95912

View File

@@ -109,19 +109,22 @@ __global__ void wave_equation_step(real_t *__restrict__ d_u_prv,
d_u_nxt[idx] = 2.0 * u_center - d_u_prv[idx] + coeff * laplacian;
}
int_t linear_idx = blockIdx.y * gridDim.x * blockDim.x * blockDim.y +
blockIdx.x * blockDim.x * blockDim.y +
threadIdx.y * blockDim.x + threadIdx.x;
grid.sync();
if (linear_idx < M) {
int_t row_offset = (linear_idx + 1) * stride;
int_t global_thread_idx = (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y) +
threadIdx.y * blockDim.x + threadIdx.x;
if (global_thread_idx < M) {
int_t row = global_thread_idx;
int_t row_offset = (row + 1) * stride;
d_u_nxt[row_offset] = d_u_nxt[row_offset + 2];
d_u_nxt[row_offset + N + 1] = d_u_nxt[row_offset + N - 1];
}
if (linear_idx < N) {
d_u_nxt[linear_idx + 1] = d_u_nxt[2 * stride + linear_idx + 1];
d_u_nxt[(M + 1) * stride + linear_idx + 1] = d_u_nxt[(M - 1) * stride + linear_idx + 1];
if (global_thread_idx < N) {
int_t col = global_thread_idx;
d_u_nxt[col + 1] = d_u_nxt[2 * stride + col + 1];
d_u_nxt[(M + 1) * stride + col + 1] = d_u_nxt[(M - 1) * stride + col + 1];
}
}
@@ -176,14 +179,17 @@ void occupancy(void) {
printf("Grid size set to: (%d, %d)\n", gridDim.x, gridDim.y);
printf("Launched blocks of size: (%d, %d)\n", BLOCKX, BLOCKY);
int warps_per_block = (threads_per_block + 31) / 32;
int max_warps_per_sm = p.maxThreadsPerMultiProcessor / 32;
int max_blocks_per_sm = p.maxThreadsPerMultiProcessor / threads_per_block;
int active_warps = max_blocks_per_sm * warps_per_block;
int numBlocksPerSm = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocksPerSm,
wave_equation_step,
threads_per_block,
0);
real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm;
if (occupancy_ratio > 1.0)
occupancy_ratio = 1.0;
int activeWarps = numBlocksPerSm * ((threads_per_block + 31) / 32);
int maxWarps = p.maxThreadsPerMultiProcessor / 32;
real_t occupancy_ratio = (real_t)activeWarps / (real_t)maxWarps;
printf("Theoretical occupancy: %.6f\n", occupancy_ratio);
}