ex6: finish

2025-10-25 18:07:49 +02:00
parent 919e9ce992
commit 42cf2f9592
1 changed files with 34 additions and 7 deletions
--- a/exercise6/mandel.cu
+++ b/exercise6/mandel.cu
@@ -1,3 +1,6 @@
+#include <cuda_device_runtime_api.h>
+#include <cuda_runtime.h>
+#include <driver_types.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -8,8 +11,8 @@
 #define YSIZE 2048

 /* Divide the problem into blocks of BLOCKX x BLOCKY threads */
-#define BLOCKY 8
-#define BLOCKX 8
+#define BLOCKY 16
+#define BLOCKX 16

 #define MAXITER 255

@@ -30,7 +33,28 @@ typedef struct {

 /********** SUBTASK1: Create kernel device_calculate *************************/

-// Insert code here
+__global__ void device_calculate(double step, double xleft, double yupper, int *gpu_buf) {
+  // Computes one pixel per thread: (i, j) is this thread's global x/y index
+  // in the launch, and the iteration count is stored at gpu_buf[PIXEL(i, j)].
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  int j = threadIdx.y + blockIdx.y * blockDim.y;
+  // Threads outside the XSIZE x YSIZE image must not write past gpu_buf.
+  if (i >= XSIZE || j >= YSIZE) return;
+  // Iterate z <- z*z + c from z = c until |z|^2 >= 4.0 or MAXITER steps.
+  my_complex_t c, z, temp;
+  int iter = 0;
+  c.real = (xleft + step * i);
+  c.imag = (yupper - step * j);
+  z = c;
+  while (z.real * z.real + z.imag * z.imag < 4.0) {
+    temp.real = z.real * z.real - z.imag * z.imag + c.real;
+    temp.imag = 2.0 * z.real * z.imag + c.imag;
+    z = temp;
+    if (++iter == MAXITER)
+      break;
+  }
+  gpu_buf[PIXEL(i, j)] = iter;
+}

 /********** SUBTASK1 END *****************************************************/

@@ -130,14 +154,17 @@ int main(int argc, char **argv) {

  /********** SUBTASK2: Set up device memory *******************************/

-  // Insert code here
+  int *gpu_buf;
+  cudaMalloc(&gpu_buf, sizeof(int) * XSIZE * YSIZE);

  /********** SUBTASK2 END *************************************************/

  start = walltime();
  /********** SUBTASK3: Execute the kernel on the device *******************/

-  // Insert code here
+  dim3 grid(XSIZE / BLOCKX, YSIZE / BLOCKY);
+  dim3 block(BLOCKX, BLOCKY);
+  device_calculate<<<grid, block>>>(step, xleft, yupper, gpu_buf);

  /********** SUBTASK3 END *************************************************/

@@ -147,7 +174,7 @@ int main(int argc, char **argv) {

  /********** SUBTASK4: Transfer the result from device to device_pixel[][]*/

-  // Insert code here
+  cudaMemcpy(device_pixel, gpu_buf, sizeof(int) * XSIZE * YSIZE, cudaMemcpyDeviceToHost);

  /********** SUBTASK4 END *************************************************/

@@ -155,7 +182,7 @@ int main(int argc, char **argv) {

  /********** SUBTASK5: Free the device memory also ************************/

-  // Insert code here
+  cudaFree(gpu_buf);

  /********** SUBTASK5 END *************************************************/