ex7: handle uneven sizes (27*28)
This commit is contained in:
@@ -109,19 +109,22 @@ __global__ void wave_equation_step(real_t *__restrict__ d_u_prv,
|
||||
d_u_nxt[idx] = 2.0 * u_center - d_u_prv[idx] + coeff * laplacian;
|
||||
}
|
||||
|
||||
int_t linear_idx = blockIdx.y * gridDim.x * blockDim.x * blockDim.y +
|
||||
blockIdx.x * blockDim.x * blockDim.y +
|
||||
threadIdx.y * blockDim.x + threadIdx.x;
|
||||
grid.sync();
|
||||
|
||||
if (linear_idx < M) {
|
||||
int_t row_offset = (linear_idx + 1) * stride;
|
||||
int_t global_thread_idx = (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y) +
|
||||
threadIdx.y * blockDim.x + threadIdx.x;
|
||||
|
||||
if (global_thread_idx < M) {
|
||||
int_t row = global_thread_idx;
|
||||
int_t row_offset = (row + 1) * stride;
|
||||
d_u_nxt[row_offset] = d_u_nxt[row_offset + 2];
|
||||
d_u_nxt[row_offset + N + 1] = d_u_nxt[row_offset + N - 1];
|
||||
}
|
||||
|
||||
if (linear_idx < N) {
|
||||
d_u_nxt[linear_idx + 1] = d_u_nxt[2 * stride + linear_idx + 1];
|
||||
d_u_nxt[(M + 1) * stride + linear_idx + 1] = d_u_nxt[(M - 1) * stride + linear_idx + 1];
|
||||
if (global_thread_idx < N) {
|
||||
int_t col = global_thread_idx;
|
||||
d_u_nxt[col + 1] = d_u_nxt[2 * stride + col + 1];
|
||||
d_u_nxt[(M + 1) * stride + col + 1] = d_u_nxt[(M - 1) * stride + col + 1];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,14 +179,17 @@ void occupancy(void) {
|
||||
printf("Grid size set to: (%d, %d)\n", gridDim.x, gridDim.y);
|
||||
printf("Launched blocks of size: (%d, %d)\n", BLOCKX, BLOCKY);
|
||||
|
||||
int warps_per_block = (threads_per_block + 31) / 32;
|
||||
int max_warps_per_sm = p.maxThreadsPerMultiProcessor / 32;
|
||||
int max_blocks_per_sm = p.maxThreadsPerMultiProcessor / threads_per_block;
|
||||
int active_warps = max_blocks_per_sm * warps_per_block;
|
||||
int numBlocksPerSm = 0;
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocksPerSm,
|
||||
wave_equation_step,
|
||||
threads_per_block,
|
||||
0);
|
||||
|
||||
real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm;
|
||||
if (occupancy_ratio > 1.0)
|
||||
occupancy_ratio = 1.0;
|
||||
int activeWarps = numBlocksPerSm * ((threads_per_block + 31) / 32);
|
||||
int maxWarps = p.maxThreadsPerMultiProcessor / 32;
|
||||
|
||||
real_t occupancy_ratio = (real_t)activeWarps / (real_t)maxWarps;
|
||||
|
||||
printf("Theoretical occupancy: %.6f\n", occupancy_ratio);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user