diff --git a/exercise6/mandel.cu b/exercise6/mandel.cu
index 1265b07..d33d9a0 100644
--- a/exercise6/mandel.cu
+++ b/exercise6/mandel.cu
@@ -1,3 +1,6 @@
+#include <cuda_device_runtime_api.h>
+#include <cuda_runtime.h>
+#include <driver_types.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -8,8 +11,8 @@
 #define YSIZE 2048
 
 /* Divide the problem into blocks of BLOCKX x BLOCKY threads */
-#define BLOCKY 8
-#define BLOCKX 8
+#define BLOCKY 16 /* threads per block along y; BLOCKX * BLOCKY = 256 threads per block */
+#define BLOCKX 16 /* threads per block along x */
 
 #define MAXITER 255
 
@@ -30,7 +33,28 @@ typedef struct {
 
 /********** SUBTASK1: Create kernel device_calculate *************************/
 
-// Insert code here
+__global__ void device_calculate(double step, double xleft, double yupper, int *gpu_buf) {
+  // Expects a 2D launch with one thread per pixel: thread (i, j) iterates the
+  // point c = (xleft + step * i, yupper - step * j) and stores its iteration
+  // count (at most MAXITER) in gpu_buf[PIXEL(i, j)], a device buffer.
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  int j = threadIdx.y + blockIdx.y * blockDim.y;
+  if (i >= XSIZE || j >= YSIZE) return; // threads outside the image (rounded-up grid) must not write
+  // same as inner-most part of host_calculate
+  my_complex_t c, z, temp;
+  int iter = 0;
+  c.real = (xleft + step * i);
+  c.imag = (yupper - step * j);
+  z = c;
+  while (z.real * z.real + z.imag * z.imag < 4.0) {
+    temp.real = z.real * z.real - z.imag * z.imag + c.real;
+    temp.imag = 2.0 * z.real * z.imag + c.imag;
+    z = temp;
+    if (++iter == MAXITER)
+      break;
+  }
+  gpu_buf[PIXEL(i, j)] = iter;
+}
 
 /********** SUBTASK1 END *****************************************************/
 
@@ -130,14 +154,17 @@ int main(int argc, char **argv) {
 
   /********** SUBTASK2: Set up device memory *******************************/
 
-  // Insert code here
+  int *gpu_buf;
+  cudaMalloc(&gpu_buf, sizeof(int) * XSIZE * YSIZE);
 
   /********** SUBTASK2 END *************************************************/
 
   start = walltime();
   /********** SUBTASK3: Execute the kernel on the device *******************/
 
-  // Insert code here
+  dim3 grid(XSIZE / BLOCKX, YSIZE / BLOCKY);
+  dim3 block(BLOCKX, BLOCKY);
+  device_calculate<<<grid, block>>>(step, xleft, yupper, gpu_buf);
 
   /********** SUBTASK3 END *************************************************/
 
@@ -147,7 +174,7 @@ int main(int argc, char **argv) {
 
   /********** SUBTASK4: Transfer the result from device to device_pixel[][]*/
 
-  // Insert code here
+  cudaMemcpy(device_pixel, gpu_buf, sizeof(int) * XSIZE * YSIZE, cudaMemcpyDeviceToHost);
 
   /********** SUBTASK4 END *************************************************/
 
@@ -155,7 +182,7 @@ int main(int argc, char **argv) {
 
   /********** SUBTASK5: Free the device memory also ************************/
 
-  // Insert code here
+  cudaFree(gpu_buf);
 
   /********** SUBTASK5 END *************************************************/