Files
TDT4200/exercise5/handout_pthreads/wave_2d_pthread.c
fredrikr79 0904916ba2 ex5: improve cache locality
use row-major block partitioning for better cache locality
2025-10-20 19:58:53 +02:00

209 lines
5.2 KiB
C

#define _XOPEN_SOURCE 600
#include <errno.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
// TASK: T1a
// Include the pthreads library
// BEGIN: T1a
#include <pthread.h>
// END: T1a
// Option to change numerical precision
typedef int64_t int_t;
typedef double real_t;
// TASK: T1b
// Pthread management
// BEGIN: T1b
int_t n_threads = 4;
pthread_barrier_t barrier;
// END: T1b
// Performance measurement
struct timeval t_start, t_end;
#define WALLTIME(t) ((double)(t).tv_sec + 1e-6 * (double)(t).tv_usec)
// Simulation parameters: size, step count, and how often to save the state
const int_t
N = 1024,
max_iteration = 4000,
snapshot_freq = 20;
// Wave equation parameters, time step is derived from the space step
const real_t
c = 1.0,
h = 1.0;
real_t
dt;
// Buffers for three time steps, indexed with 2 ghost points for the boundary
real_t
*buffers[3] = { NULL, NULL, NULL };
#define U_prv(i, j) buffers[0][((i) + 1) * (N + 2) + (j) + 1]
#define U(i, j) buffers[1][((i) + 1) * (N + 2) + (j) + 1]
#define U_nxt(i, j) buffers[2][((i) + 1) * (N + 2) + (j) + 1]
// Rotate the time step buffers.
void move_buffer_window(void) {
real_t *temp = buffers[0];
buffers[0] = buffers[1];
buffers[1] = buffers[2];
buffers[2] = temp;
}
// Set up our three buffers, and fill two with an initial perturbation
void domain_initialize(void) {
buffers[0] = malloc((N + 2) * (N + 2) * sizeof(real_t));
buffers[1] = malloc((N + 2) * (N + 2) * sizeof(real_t));
buffers[2] = malloc((N + 2) * (N + 2) * sizeof(real_t));
for (int_t i = 0; i < N; i++) {
for (int_t j = 0; j < N; j++) {
real_t delta = sqrt(((i - N / 2) * (i - N / 2) + (j - N / 2) * (j - N / 2)) / (real_t)N);
U_prv(i, j) = U(i, j) = exp(-4.0 * delta * delta);
}
}
// Set the time step
dt = (h * h) / (4.0 * c * c);
}
// Get rid of all the memory allocations
void domain_finalize(void) {
free(buffers[0]);
free(buffers[1]);
free(buffers[2]);
}
// TASK: T3
// Integration formula
void time_step(int_t thread_id) {
// BEGIN: T3
// let threads work each on a row for better cache locality (huge speed-up)
int_t start_row = thread_id * (N / n_threads);
// let last thread handle remainder
int_t end_row = (thread_id == n_threads - 1) ? N : (thread_id + 1) * (N / n_threads);
for (int_t i = start_row; i < end_row; i++)
for (int_t j = 0; j < N; j++)
U_nxt(i, j) = -U_prv(i, j) + 2.0 * U(i, j) + (dt * dt * c * c) / (h * h) * (U(i - 1, j) + U(i + 1, j) + U(i, j - 1) + U(i, j + 1) - 4.0 * U(i, j));
// END: T3
}
// TASK: T4
// Neumann (reflective) boundary condition
void boundary_condition(int_t thread_id) {
// BEGIN: T4
for (int_t i = 0; i < N; i += 1) {
if (i % n_threads != thread_id)
continue;
U(i, -1) = U(i, 1);
U(i, N) = U(i, N - 2);
}
for (int_t j = 0; j < N; j += 1) {
if (j % n_threads != thread_id)
continue;
U(-1, j) = U(1, j);
U(N, j) = U(N - 2, j);
}
// END: T4
}
// Save the present time step in a numbered file under 'data/'
void domain_save(int_t step) {
char filename[256];
sprintf(filename, "data/%.5ld.dat", step);
FILE *out = fopen(filename, "wb");
for (int_t i = 0; i < N; i++)
fwrite(&U(i, 0), sizeof(real_t), N, out);
fclose(out);
}
// TASK: T5
// Main loop
void *simulate(void *id) {
// BEGIN: T5
// Go through each time step
for (int_t iteration = 0; iteration <= max_iteration; iteration++) {
int_t tid = *((int_t *)id);
if (tid == 0)
if ((iteration % snapshot_freq) == 0)
domain_save(iteration / snapshot_freq);
// Derive step t+1 from steps t and t-1
boundary_condition(tid);
pthread_barrier_wait(&barrier);
time_step(tid);
pthread_barrier_wait(&barrier);
// Rotate the time step buffers
if (tid == 0)
move_buffer_window();
pthread_barrier_wait(&barrier);
}
// END: T5
}
// Main time integration loop
int main(int argc, char **argv) {
// Number of threads is an optional argument, sanity check its value
if (argc > 1) {
n_threads = strtol(argv[1], NULL, 10);
if (errno == EINVAL)
fprintf(stderr, "'%s' is not a valid thread count\n", argv[1]);
if (n_threads < 1) {
fprintf(stderr, "Number of threads must be >0\n");
exit(EXIT_FAILURE);
}
}
// TASK: T1c
// Initialise pthreads
// BEGIN: T1c
pthread_barrier_init(&barrier, NULL, n_threads);
// END: T1b
// Set up the initial state of the domain
domain_initialize();
// Time the execution
gettimeofday(&t_start, NULL);
// TASK: T2
// Run the integration loop
// BEGIN: T2
pthread_t threads[n_threads];
int_t tids[n_threads];
for (int i = 0; i < n_threads; i++) {
tids[i] = i;
pthread_create(&threads[tids[i]], NULL, &simulate, &tids[i]);
}
for (int i = 0; i < n_threads; i++)
pthread_join(threads[tids[i]], NULL);
// END: T2
// Report how long we spent in the integration stage
gettimeofday(&t_end, NULL);
printf("%lf seconds elapsed with %ld threads\n",
WALLTIME(t_end) - WALLTIME(t_start),
n_threads);
// Clean up and shut down
domain_finalize();
// TASK: T1d
// Finalise pthreads
// BEGIN: T1d
pthread_barrier_destroy(&barrier);
// END: T1d
exit(EXIT_SUCCESS);
}