diff --git a/exercise7/wave_2d_parallel.cu b/exercise7/wave_2d_parallel.cu index ce2fbc6..d35f706 100644 --- a/exercise7/wave_2d_parallel.cu +++ b/exercise7/wave_2d_parallel.cu @@ -109,19 +109,22 @@ __global__ void wave_equation_step(real_t *__restrict__ d_u_prv, d_u_nxt[idx] = 2.0 * u_center - d_u_prv[idx] + coeff * laplacian; } - int_t linear_idx = blockIdx.y * gridDim.x * blockDim.x * blockDim.y + - blockIdx.x * blockDim.x * blockDim.y + - threadIdx.y * blockDim.x + threadIdx.x; + grid.sync(); - if (linear_idx < M) { - int_t row_offset = (linear_idx + 1) * stride; + int_t global_thread_idx = (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y) + + threadIdx.y * blockDim.x + threadIdx.x; + + if (global_thread_idx < M) { + int_t row = global_thread_idx; + int_t row_offset = (row + 1) * stride; d_u_nxt[row_offset] = d_u_nxt[row_offset + 2]; d_u_nxt[row_offset + N + 1] = d_u_nxt[row_offset + N - 1]; } - if (linear_idx < N) { - d_u_nxt[linear_idx + 1] = d_u_nxt[2 * stride + linear_idx + 1]; - d_u_nxt[(M + 1) * stride + linear_idx + 1] = d_u_nxt[(M - 1) * stride + linear_idx + 1]; + if (global_thread_idx < N) { + int_t col = global_thread_idx; + d_u_nxt[col + 1] = d_u_nxt[2 * stride + col + 1]; + d_u_nxt[(M + 1) * stride + col + 1] = d_u_nxt[(M - 1) * stride + col + 1]; } } @@ -176,14 +179,17 @@ void occupancy(void) { printf("Grid size set to: (%d, %d)\n", gridDim.x, gridDim.y); printf("Launched blocks of size: (%d, %d)\n", BLOCKX, BLOCKY); - int warps_per_block = (threads_per_block + 31) / 32; - int max_warps_per_sm = p.maxThreadsPerMultiProcessor / 32; - int max_blocks_per_sm = p.maxThreadsPerMultiProcessor / threads_per_block; - int active_warps = max_blocks_per_sm * warps_per_block; + int numBlocksPerSm = 0; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm, + wave_equation_step, + threads_per_block, + 0); - real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm; - if (occupancy_ratio > 1.0) - occupancy_ratio = 1.0; + int activeWarps = numBlocksPerSm * ((threads_per_block + 31) / 32); + int maxWarps = p.maxThreadsPerMultiProcessor / 32; + + real_t occupancy_ratio = (real_t)activeWarps / (real_t)maxWarps; printf("Theoretical occupancy: %.6f\n", occupancy_ratio); }