ex5: improve cache locality
Partition the grid into contiguous blocks of rows, one block per thread, so that each thread sweeps whole rows (row-major order) for better cache locality.
This commit is contained in:
@@ -85,10 +85,13 @@ void domain_finalize(void) {
|
||||
// Integration formula
|
||||
// Advance one thread's share of the grid by a single time step.
//
// The N rows are divided into n_threads contiguous blocks and this call
// updates every column of the rows in block `thread_id`. Each U_nxt(i, j) is
// computed from U_prv(i, j), U(i, j) and the four direct neighbours of
// U(i, j), scaled by (dt * dt * c * c) / (h * h).
//
// Block bounds are floor(k * N / n_threads) for k = thread_id, thread_id + 1.
// The blocks tile [0, N) exactly once, and any remainder rows are spread over
// the threads (block sizes differ by at most one row) instead of all landing
// on the last thread.
//
// NOTE(review): assumes n_threads > 0 and 0 <= thread_id < n_threads, and
// that U() accepts the indices -1 and N reached at the grid edges (e.g. via
// a halo layer) -- confirm against the macro definitions and the caller.
void time_step(int_t thread_id) {
    // BEGIN: T3
    const int_t first_row = (thread_id * N) / n_threads;
    const int_t row_limit = ((thread_id + 1) * N) / n_threads; // exclusive

    for (int_t i = first_row; i < row_limit; i++) {
        // Walk a full row at a time; presumably consecutive j are adjacent in
        // memory, which is what makes the row-block split cache friendly.
        for (int_t j = 0; j < N; j++) {
            U_nxt(i, j) = -U_prv(i, j) + 2.0 * U(i, j) + (dt * dt * c * c) / (h * h) * (U(i - 1, j) + U(i + 1, j) + U(i, j - 1) + U(i, j + 1) - 4.0 * U(i, j));
        }
    }
    // END: T3
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user