diff --git a/exercise7/wave_2d_parallel.cu b/exercise7/wave_2d_parallel.cu
index 8f926b4..3eb7cab 100644
--- a/exercise7/wave_2d_parallel.cu
+++ b/exercise7/wave_2d_parallel.cu
@@ -1,3 +1,4 @@
+#include <cooperative_groups.h>
 #include <cuda_runtime_api.h>
 #include <driver_types.h>
 #include <errno.h>
@@ -10,49 +11,27 @@
 #include <sys/stat.h>
 #include <sys/time.h>
 
-// TASK: T1
-// Include the cooperative groups library
-// BEGIN: T1
-#include <cooperative_groups.h>
-// END: T1
+namespace cg = cooperative_groups;
 
-// Convert 'struct timeval' into seconds in double prec. floating point
 #define WALLTIME(t) ((double)(t).tv_sec + 1e-6 * (double)(t).tv_usec)
 
-// Option to change numerical precision
 typedef int64_t int_t;
 typedef double real_t;
 
-// TASK: T1b
-// Variables needed for implementation
-// BEGIN: T1b
-
-// Simulation parameters: size, step count, and how often to save the state
-int_t
-    N = 128,
-    M = 128,
-    max_iteration = 1000000,
-    snapshot_freq = 1000;
+int_t N = 128, M = 128, max_iteration = 1000000, snapshot_freq = 1000;
 
 #define BLOCKX 16
 #define BLOCKY 16
 
-// Wave equation parameters, time step is derived from the space step
-const real_t
-    c = 1.0,
-    dx = 1.0,
-    dy = 1.0;
-real_t
-    dt;
+const real_t c = 1.0, dx = 1.0, dy = 1.0;
+real_t dt;
 
-// Buffers for three time steps, indexed with 2 ghost points for the boundary
-real_t *buffers[3] = { NULL, NULL, NULL }; // device buffers
+real_t *buffers[3] = { NULL, NULL, NULL };
 real_t *h_buffer = NULL;
 
 #define U_prv(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]
 #define U(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]
 #define U_nxt(i, j) h_buffer[((i) + 1) * (N + 2) + (j) + 1]
-// END: T1b
 
 #define cudaErrorCheck(ans)               \
   {                                       \
@@ -66,7 +45,6 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort =
   }
 }
 
-// Rotate the time step buffers.
 void move_buffer_window(void) {
   real_t *temp = buffers[0];
   buffers[0] = buffers[1];
@@ -74,25 +52,19 @@ void move_buffer_window(void) {
   buffers[2] = temp;
 }
 
-// Save the present time step in a numbered file under 'data/'
 void domain_save(int_t step) {
   char filename[256];
-
-  // Ensure output directory exists (ignore error if it already exists)
   if (mkdir("data", 0755) != 0 && errno != EEXIST) {
     perror("mkdir data");
     exit(EXIT_FAILURE);
   }
-
   snprintf(filename, sizeof(filename), "data/%05" PRId64 ".dat", step);
-
   FILE *out = fopen(filename, "wb");
   if (out == NULL) {
     perror("fopen output file");
     fprintf(stderr, "Failed to open '%s' for writing.\n", filename);
     exit(EXIT_FAILURE);
   }
-
   for (int_t i = 0; i < M; ++i) {
     size_t written = fwrite(&U(i, 0), sizeof(real_t), (size_t)N, out);
     if (written != (size_t)N) {
@@ -101,116 +73,100 @@ void domain_save(int_t step) {
       exit(EXIT_FAILURE);
     }
   }
-
   if (fclose(out) != 0) {
     perror("fclose");
     exit(EXIT_FAILURE);
   }
 }
 
-// TASK: T4
-// Get rid of all the memory allocations
 void domain_finalize(void) {
-  // BEGIN: T4
   cudaFree(buffers[0]);
   cudaFree(buffers[1]);
   cudaFree(buffers[2]);
   free(h_buffer);
-  // END: T4
 }
 
-// TASK: T6
-// Neumann (reflective) boundary condition
-// BEGIN: T6
-__global__ void boundary_condition(real_t *u, int_t M, int_t N) {
-  int_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+// combined kernel for both time step and boundary condition
+__global__ void wave_equation_step(real_t *u_prv, real_t *u, real_t *u_nxt,
+                                   int_t M, int_t N, real_t c, real_t dt,
+                                   real_t dx, real_t dy) {
+  cg::grid_group grid = cg::this_grid();
+  cg::thread_block block = cg::this_thread_block();
 
-  if (idx < M) {
-    int_t i = idx;
-    u[(i + 1) * (N + 2) + 0] = u[(i + 1) * (N + 2) + 2];
-    u[(i + 1) * (N + 2) + (N + 1)] = u[(i + 1) * (N + 2) + (N - 1)];
-  }
-
-  if (idx < N) {
-    int_t j = idx;
-    u[0 * (N + 2) + (j + 1)] = u[2 * (N + 2) + (j + 1)];
-    u[(M + 1) * (N + 2) + (j + 1)] = u[(M - 1) * (N + 2) + (j + 1)];
-  }
-}
-// END: T6
-
-// TASK: T5
-// Integration formula
-// BEGIN: T5
-__global__ void time_step(real_t *u_prv, real_t *u, real_t *u_nxt,
-                          int_t M, int_t N, real_t c, real_t dt,
-                          real_t dx, real_t dy) {
   int_t i = blockIdx.y * blockDim.y + threadIdx.y;
   int_t j = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if (i >= M || j >= N)
-    return;
+  // time step
+  if (i < M && j < N) {
+    int_t idx = (i + 1) * (N + 2) + (j + 1);
+    int_t idx_up = (i + 2) * (N + 2) + (j + 1);
+    int_t idx_down = (i) * (N + 2) + (j + 1);
+    int_t idx_right = (i + 1) * (N + 2) + (j + 2);
+    int_t idx_left = (i + 1) * (N + 2) + (j);
 
-  int_t idx = (i + 1) * (N + 2) + (j + 1);
-  int_t idx_up = (i + 2) * (N + 2) + (j + 1);
-  int_t idx_down = (i) * (N + 2) + (j + 1);
-  int_t idx_right = (i + 1) * (N + 2) + (j + 2);
-  int_t idx_left = (i + 1) * (N + 2) + (j);
+    real_t d2udx2 = (u[idx_right] - 2.0 * u[idx] + u[idx_left]) / (dx * dx);
+    real_t d2udy2 = (u[idx_up] - 2.0 * u[idx] + u[idx_down]) / (dy * dy);
 
-  real_t d2udx2 = (u[idx_right] - 2.0 * u[idx] + u[idx_left]) / (dx * dx);
-  real_t d2udy2 = (u[idx_up] - 2.0 * u[idx] + u[idx_down]) / (dy * dy);
+    u_nxt[idx] = 2.0 * u[idx] - u_prv[idx] + (c * dt) * (c * dt) * (d2udx2 + d2udy2);
+  }
 
-  u_nxt[idx] = 2.0 * u[idx] - u_prv[idx] + (c * dt) * (c * dt) * (d2udx2 + d2udy2);
+  grid.sync();
+
+  int_t linear_idx = blockIdx.x * blockDim.x * blockDim.y +
+                     threadIdx.y * blockDim.x + threadIdx.x;
+
+  // boundary condition
+  if (linear_idx < M) {
+    int_t row = linear_idx;
+    u_nxt[(row + 1) * (N + 2) + 0] = u_nxt[(row + 1) * (N + 2) + 2];
+    u_nxt[(row + 1) * (N + 2) + (N + 1)] = u_nxt[(row + 1) * (N + 2) + (N - 1)];
+  }
+
+  if (linear_idx < N) {
+    int_t col = linear_idx;
+    u_nxt[0 * (N + 2) + (col + 1)] = u_nxt[2 * (N + 2) + (col + 1)];
+    u_nxt[(M + 1) * (N + 2) + (col + 1)] = u_nxt[(M - 1) * (N + 2) + (col + 1)];
+  }
 }
-// END: T5
 
-// TASK: T7
-// Main time integration.
 void simulate(void) {
-  // BEGIN: T7
   dim3 blockDim(BLOCKX, BLOCKY);
   dim3 gridDim((N + blockDim.x - 1) / blockDim.x,
                (M + blockDim.y - 1) / blockDim.y);
 
-  int_t boundary_threads = M > N ? M : N;
-  int_t boundary_blocks = (boundary_threads + BLOCKX * BLOCKY - 1) / (BLOCKX * BLOCKY);
-
-  // Save initial state
   size_t size = (M + 2) * (N + 2) * sizeof(real_t);
   cudaMemcpy(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost);
   domain_save(0);
 
-  // Go through each time step
+  void *kernelArgs[] = {
+    (void *)&buffers[0], (void *)&buffers[1], (void *)&buffers[2],
+    (void *)&M, (void *)&N, (void *)&c, (void *)&dt, (void *)&dx, (void *)&dy
+  };
+
   for (int_t iteration = 1; iteration <= max_iteration; iteration++) {
-    time_step<<<gridDim, blockDim>>>(buffers[0], buffers[1], buffers[2],
-                                     M, N, c, dt, dx, dy);
+    cudaLaunchCooperativeKernel(
+        (void *)wave_equation_step,
+        gridDim, blockDim,
+        kernelArgs);
 
-    boundary_condition<<<boundary_blocks, BLOCKX * BLOCKY>>>(buffers[2], M, N);
-
-    cudaDeviceSynchronize();
+    cudaErrorCheck(cudaGetLastError());
+    cudaErrorCheck(cudaDeviceSynchronize());
 
     move_buffer_window();
 
-    // Save snapshots at specified frequency
     if (iteration % snapshot_freq == 0) {
       cudaMemcpy(h_buffer, buffers[1], size, cudaMemcpyDeviceToHost);
       domain_save(iteration / snapshot_freq);
     }
   }
-  // END: T7
 }
 
-// TASK: T8
-// GPU occupancy
 void occupancy(void) {
-  // BEGIN: T8
   cudaDeviceProp p;
   cudaGetDeviceProperties(&p, 0);
 
   dim3 blockDim(BLOCKX, BLOCKY);
   int threads_per_block = blockDim.x * blockDim.y;
-
-  // Calculate grid dimensions
   dim3 gridDim((N + BLOCKX - 1) / BLOCKX, (N + BLOCKY - 1) / BLOCKY);
 
   printf("Grid size set to:             (%d, %d)\n", gridDim.x, gridDim.y);
@@ -222,22 +178,16 @@ void occupancy(void) {
   int active_warps = max_blocks_per_sm * warps_per_block;
 
   real_t occupancy_ratio = (real_t)active_warps / (real_t)max_warps_per_sm;
-
   if (occupancy_ratio > 1.0)
     occupancy_ratio = 1.0;
 
   printf("Theoretical occupancy:        %.6f\n", occupancy_ratio);
-  // END: T8
 }
 
-// TASK: T2
-// Make sure at least one CUDA-capable device exists
 static bool init_cuda() {
-  // BEGIN: T2
   int count;
   if (cudaGetDeviceCount(&count) != cudaSuccess)
     return false;
-
   printf("CUDA device count: %d\n", count);
 
   if (count > 0) {
@@ -247,6 +197,12 @@ static bool init_cuda() {
     if (cudaGetDeviceProperties(&p, 0) != cudaSuccess)
       return false;
 
+    // Check cooperative launch support
+    if (!p.cooperativeLaunch) {
+      fprintf(stderr, "Device does not support cooperative kernel launch!\n");
+      return false;
+    }
+
     printf("CUDA device #0:\n");
     printf("        Name: %s\n", p.name);
     printf("        Compute capability: %d.%d\n", p.major, p.minor);
@@ -255,21 +211,15 @@ static bool init_cuda() {
     printf("        Global memory: %.1fGiB bytes\n", p.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
     printf("        Per-block shared memory: %.1fKiB\n", p.sharedMemPerBlock / 1024.0);
     printf("        Per-block registers: %d\n", p.regsPerBlock);
+    printf("        Cooperative launch: %s\n", p.cooperativeLaunch ? "YES" : "NO");
   }
   return true;
-  // END: T2
 }
 
-// TASK: T3
-// Set up our three buffers, and fill two with an initial perturbation
-// Function to determine occupancy and optimal configuration
-
 void domain_initialize(void) {
-  // BEGIN: T3
   bool locate_cuda = init_cuda();
-  if (!locate_cuda) {
+  if (!locate_cuda)
     exit(EXIT_FAILURE);
-  }
 
   size_t size = (M + 2) * (N + 2) * sizeof(real_t);
 
@@ -281,7 +231,6 @@ void domain_initialize(void) {
 
   for (int_t i = 0; i < M; i++) {
     for (int_t j = 0; j < N; j++) {
-      // Calculate delta (radial distance) adjusted for M x N grid
       real_t delta = sqrt(((i - M / 2.0) * (i - M / 2.0)) / (real_t)M +
                           ((j - N / 2.0) * (j - N / 2.0)) / (real_t)N);
       U_prv(i, j) = U(i, j) = exp(-4.0 * delta * delta);
@@ -292,26 +241,20 @@ void domain_initialize(void) {
   cudaMemcpy(buffers[1], h_buffer, size, cudaMemcpyHostToDevice);
   cudaMemset(buffers[2], 0, size);
 
-  // Set the time step for 2D case
   dt = dx * dy / (c * sqrt(dx * dx + dy * dy));
 }
 
 int main(void) {
-  // Set up the initial state of the domain
   domain_initialize();
 
   struct timeval t_start, t_end;
-
   gettimeofday(&t_start, NULL);
   simulate();
   gettimeofday(&t_end, NULL);
 
-  printf("Total elapsed time: %lf seconds\n",
-         WALLTIME(t_end) - WALLTIME(t_start));
-
+  printf("Total elapsed time: %lf seconds\n", WALLTIME(t_end) - WALLTIME(t_start));
   occupancy();
 
-  // Clean up and shut down
   domain_finalize();
   exit(EXIT_SUCCESS);
 }