ex5: improve cache locality

Use row-wise block partitioning (each thread updates a contiguous band of rows) for better cache locality.
This commit is contained in:
2025-10-20 19:28:23 +02:00
parent 26da3f8d5e
commit 0904916ba2

View File

@@ -85,10 +85,13 @@ void domain_finalize(void) {
// Integration formula
/*
 * Advance this thread's share of the grid by one time step.
 *
 * The N rows are split into n_threads contiguous bands of N / n_threads rows
 * each; the thread with id n_threads - 1 additionally takes the
 * N % n_threads leftover rows. A thread writes U_nxt(i, j) only for rows i
 * inside its own band, and the inner loop walks every column j of a row
 * before moving on to the next row.
 *
 * For every cell the update is
 *   U_nxt = -U_prv + 2*U + (dt^2 * c^2 / h^2) * (sum of 4 neighbours - 4*U).
 *
 * thread_id: index of the calling thread; the partitioning below only makes
 *            sense for 0 <= thread_id < n_threads.
 */
void time_step(int_t thread_id) {
    // BEGIN: T3
    // Give each thread a contiguous band of rows for better cache locality
    // (reported by the author as a huge speed-up over the per-column split).
    int_t rows_per_thread = N / n_threads;
    int_t start_row = thread_id * rows_per_thread;
    // The last thread also handles the remainder rows.
    int_t end_row = (thread_id == n_threads - 1) ? N : start_row + rows_per_thread;

    for (int_t i = start_row; i < end_row; i++) {
        for (int_t j = 0; j < N; j++) {
            // NOTE(review): at i or j equal to 0 or N - 1 the neighbour
            // indices fall outside 0..N-1; presumably U() addresses a
            // buffer with a halo border — confirm against its definition.
            U_nxt(i, j) = -U_prv(i, j) + 2.0 * U(i, j) + (dt * dt * c * c) / (h * h) * (U(i - 1, j) + U(i + 1, j) + U(i, j - 1) + U(i, j + 1) - 4.0 * U(i, j));
        }
    }
    // END: T3
}