#define _XOPEN_SOURCE 600
#include <errno.h>
#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>

#include "argument_utils.h"

// TASK: T1a
// Include the MPI headerfile
// BEGIN: T1
#include <mpi.h>
// END: T1a

// Convert 'struct timeval' into seconds in double prec. floating point
#define WALLTIME(t) ((double)(t).tv_sec + 1e-6 * (double)(t).tv_usec)

// Option to change numerical precision
typedef int64_t int_t;
typedef double real_t;

// Buffers for three time steps, indexed with 2 ghost points for the boundary
real_t
    *buffers[3] = { NULL, NULL, NULL };

// TASK: T1b
// Declare variables each MPI process will need
// BEGIN: T1b
#define U_prv(i, j) buffers[0][((i) + 1) * (local_N + 2) + (j) + 1]
#define U(i, j) buffers[1][((i) + 1) * (local_N + 2) + (j) + 1]
#define U_nxt(i, j) buffers[2][((i) + 1) * (local_N + 2) + (j) + 1]

int world_size;
int world_rank;

#define ROOT 0

int local_M, local_N;

MPI_Comm cartesian_comm;
int coordinates[2], dims[2] = { 0 };
int cartesian_rank;

// END: T1b

// Simulation parameters: size, step count, and how often to save the state
int_t
    M = 256, // rows
    N = 256, // cols
    max_iteration = 4000,
    snapshot_freq = 20;

// Wave equation parameters, time step is derived from the space step
const real_t
    c = 1.0,
    dx = 1.0,
    dy = 1.0;
real_t
    dt;

// Rotate the time step buffers.
void move_buffer_window(void) {
  real_t *temp = buffers[0];
  buffers[0] = buffers[1];
  buffers[1] = buffers[2];
  buffers[2] = temp;
}

// TASK: T8
// Save the present time step in a numbered file under 'data/'
void domain_save(int_t step) {
  // BEGIN: T8
  char filename[256];

  // Ensure output directory exists (ignore error if it already exists)
  if (mkdir("data", 0755) != 0 && errno != EEXIST) {
    perror("mkdir data");
    exit(EXIT_FAILURE);
  }

  snprintf(filename, sizeof(filename), "data/%05" PRId64 ".dat", step);

  FILE *out = fopen(filename, "wb");
  if (out == NULL) {
    perror("fopen output file");
    fprintf(stderr, "Failed to open '%s' for writing.\n", filename);
    exit(EXIT_FAILURE);
  }

  for (int_t i = 0; i < M; ++i) {
    size_t written = fwrite(&U(i, 0), sizeof(real_t), (size_t)N, out);
    if (written != (size_t)N) {
      perror("fwrite");
      fclose(out);
      exit(EXIT_FAILURE);
    }
  }

  if (fclose(out) != 0) {
    perror("fclose");
    exit(EXIT_FAILURE);
  }
  // END: T8
}

// TASK: T7
// Neumann (reflective) boundary condition
void boundary_condition(void) {
  // BEGIN: T7
  // Vertical ghost layers (bottom and top)
  for (int_t i = 0; i < M; i++) {
    U(i, -1) = U(i, 1);    // bottom ghost row <- mirror of row 1
    U(i, N) = U(i, N - 2); // top ghost row <- mirror of row N-2
  }

  // Horizontal ghost layers (left and right)
  for (int_t j = 0; j < N; j++) {
    U(-1, j) = U(1, j);    // left ghost col <- mirror of col 1
    U(M, j) = U(M - 2, j); // right ghost col <- mirror of col M-2
  }

  // Corner ghost cells (use ghost indices)
  U(-1, -1) = U(1, 1);       // bottom-left
  U(-1, N) = U(1, N - 2);    // top-left
  U(M, -1) = U(M - 2, 1);    // bottom-right
  U(M, N) = U(M - 2, N - 2); // top-right
  // END: T7
}

// TASK: T4
// Set up our three buffers, and fill two with an initial perturbation
// and set the time step.
void domain_initialize(void) {
  // BEGIN: T4
  buffers[0] = malloc((local_M + 2) * (local_N + 2) * sizeof(real_t));
  buffers[1] = malloc((local_M + 2) * (local_N + 2) * sizeof(real_t));
  buffers[2] = malloc((local_M + 2) * (local_N + 2) * sizeof(real_t));

  if (!buffers[0] || !buffers[1] || !buffers[2]) {
    perror("malloc");
    exit(EXIT_FAILURE);
  }

  // initialize interior (physical cells)
  int rows_offset = coordinates[0] * local_M;
  int cols_offset = coordinates[1] * local_N;
  for (int_t i = 0; i < local_M; i++) {
    for (int_t j = 0; j < local_N; j++) {
      real_t dx_i = (i + rows_offset - M / 2.0);
      real_t dy_j = (j + cols_offset - N / 2.0);
      real_t delta = sqrt((dx_i * dx_i) / (real_t)M + (dy_j * dy_j) / (real_t)N);
      U_prv(i, j) = U(i, j) = exp(-4.0 * delta * delta);
    }
  }

  // Set the time step (CFL) with a safety factor
  dt = 1.0 / (c * sqrt(1.0 / (dx * dx) + 1.0 / (dy * dy)));
  dt *= 0.9; // safety factor

  // Fill ghost cells once so they are valid before the first step
  boundary_condition();
  // END: T4
}

// Get rid of all the memory allocations
void domain_finalize(void) {
  free(buffers[0]);
  free(buffers[1]);
  free(buffers[2]);
}

// TASK: T5
// Integration formula
void time_step(void) {
  // BEGIN: T5
  double coef_x = (c * dt) * (c * dt) / (dx * dx);
  double coef_y = (c * dt) * (c * dt) / (dy * dy);
  double coef_xy = (c * dt) * (c * dt) / (6.0 * dx * dy);

  // Only iterate over physical domain cells i = 0..M-1, j = 0..N-1
  for (int i = 0; i < local_M; i++) {
    for (int j = 0; j < local_N; j++) {
      U_nxt(i, j) = 2.0 * U(i, j) - U_prv(i, j) + coef_x * (U(i + 1, j) - 2.0 * U(i, j) + U(i - 1, j)) + coef_y * (U(i, j + 1) - 2.0 * U(i, j) + U(i, j - 1)) + coef_xy * (U(i + 1, j + 1) + U(i + 1, j - 1) + U(i - 1, j + 1) + U(i - 1, j - 1) - 4.0 * U(i, j));
    }
  }
  // END: T5
}

void get_corner_procs(void) {
  int neighbor_coords[2];

  neighbor_coords[0] = coordinates[0] - 1;
  neighbor_coords[1] = coordinates[1] - 1;
  if (neighbor_coords[0] >= 0 && neighbor_coords[1] >= 0) {
    MPI_Cart_rank(cartesian_comm, neighbor_coords, &up_left);
  } else {
    up_left = MPI_PROC_NULL;
  }

  neighbor_coords[0] = coordinates[0] - 1;
  neighbor_coords[1] = coordinates[1] + 1;
  if (neighbor_coords[0] >= 0 && neighbor_coords[1] < dims[1]) {
    MPI_Cart_rank(cartesian_comm, neighbor_coords, &up_right);
  } else {
    up_right = MPI_PROC_NULL;
  }

  neighbor_coords[0] = coordinates[0] + 1;
  neighbor_coords[1] = coordinates[1] - 1;
  if (neighbor_coords[0] < dims[0] && neighbor_coords[1] >= 0) {
    MPI_Cart_rank(cartesian_comm, neighbor_coords, &down_left);
  } else {
    down_left = MPI_PROC_NULL;
  }

  neighbor_coords[0] = coordinates[0] + 1;
  neighbor_coords[1] = coordinates[1] + 1;
  if (neighbor_coords[0] < dims[0] && neighbor_coords[1] < dims[1]) {
    MPI_Cart_rank(cartesian_comm, neighbor_coords, &down_right);
  } else {
    down_right = MPI_PROC_NULL;
  }
}

// TASK: T6
// Communicate the border between processes.
void border_exchange(void) {
  // BEGIN: T6
  ;
  // END: T6
}

// Main time integration.
void simulate(void) {
  // Go through each time step
  for (int_t iteration = 0; iteration <= max_iteration; iteration++) {
    if ((iteration % snapshot_freq) == 0) {
      domain_save(iteration / snapshot_freq);
    }

    // Derive step t+1 from steps t and t-1
    border_exchange();
    boundary_condition();
    time_step();

    // Rotate the time step buffers
    move_buffer_window();
  }
}

int main(int argc, char **argv) {
  // TASK: T1c
  // Initialise MPI
  // BEGIN: T1c
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  // END: T1c

  // TASK: T3
  // Distribute the user arguments to all the processes
  // BEGIN: T3
  // create options type for easy broadcast
  MPI_Datatype options_type;
  MPI_Type_contiguous(4, MPI_INT64_T, &options_type);
  MPI_Type_commit(&options_type);

  OPTIONS options;
  if (world_rank == ROOT) {
    OPTIONS *parsed = parse_args(argc, argv);
    if (!parsed) {
      fprintf(stderr, "Argument parsing failed\n");
      exit(EXIT_FAILURE);
    }
    options = *parsed;
    free(parsed);
  }

  MPI_Bcast(&options, 1, options_type, ROOT, MPI_COMM_WORLD);
  MPI_Type_free(&options_type);

  // read options
  M = options.M;
  N = options.N;
  max_iteration = options.max_iteration;
  snapshot_freq = options.snapshot_frequency;

  // grid dimensions
  MPI_Dims_create(world_size, 2, dims);

  local_M = M / dims[0];
  local_N = N / dims[1];

  // create cartesian communicator
  int periodicity[2] = { 0, 0 };
  MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periodicity, 1, &cartesian_comm);

  // fetch cartesian coords
  MPI_Comm_rank(cartesian_comm, &cartesian_rank);
  MPI_Cart_coords(cartesian_comm, cartesian_rank, 2, coordinates);
  MPI_Cart_shift(cartesian_comm, 0, 1, &up, &down);
  MPI_Cart_shift(cartesian_comm, 1, 1, &left, &right);
  // END: T3

  // Set up the initial state of the domain
  domain_initialize();

  struct timeval t_start, t_end;

  // TASK: T2
  // Time your code
  // BEGIN: T2
  if (world_rank == ROOT)
    gettimeofday(&t_start, NULL);

  simulate();
  // MPI_Barrier(MPI_COMM_WORLD); // TODO

  if (world_rank == ROOT) {
    gettimeofday(&t_end, NULL);

    printf("Total elapsed time: %lf seconds\n",
           WALLTIME(t_end) - WALLTIME(t_start));
  }
  // END: T2

  // Clean up and shut down
  domain_finalize();

  // TASK: T1d
  // Finalise MPI
  // BEGIN: T1d
  MPI_Comm_free(&cartesian_comm);
  MPI_Finalize();
  // END: T1d

  exit(EXIT_SUCCESS);
}