NOTE(review): this patch was recovered from a copy in which line breaks,
indentation, blank context lines and all angle-bracket text had been stripped.
The header names on the original "#include" lines could not be recovered and
are left as found; indentation and blank context lines are a best-effort
reconstruction. Regenerate the patch against the real mandel.cu before applying.

diff --git a/exercise6/mandel.cu b/exercise6/mandel.cu
index 1265b07..d33d9a0 100644
--- a/exercise6/mandel.cu
+++ b/exercise6/mandel.cu
@@ -1,3 +1,8 @@
+#include
+#include
+#include
+#include <stdio.h>
+#include <stdlib.h>
 #include
 #include
 #include
@@ -8,8 +13,8 @@
 #define YSIZE 2048
 
 /* Divide the problem into blocks of BLOCKX x BLOCKY threads */
-#define BLOCKY 8
-#define BLOCKX 8
+#define BLOCKY 16
+#define BLOCKX 16
 
 #define MAXITER 255
 
@@ -30,7 +35,49 @@ typedef struct {
 
 /********** SUBTASK1: Create kernel device_calculate *************************/
 
-// Insert code here
+/* Abort with a diagnostic if a CUDA runtime call did not succeed. */
+static void check_cuda(cudaError_t status, const char *what) {
+    if (status != cudaSuccess) {
+        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
+        exit(EXIT_FAILURE);
+    }
+}
+
+/*
+ * Computes the escape-iteration count of one pixel per thread.
+ *
+ * Expects a 2D launch: the global x/y thread indices become the pixel
+ * indices (i, j). Threads outside XSIZE x YSIZE return immediately, so
+ * the grid may overshoot the image.
+ *
+ * step    spacing between adjacent pixels in the complex plane
+ * xleft   real part assigned to i == 0
+ * yupper  imaginary part assigned to j == 0
+ * gpu_buf device buffer; entry PIXEL(i, j) receives the count (0..MAXITER)
+ */
+__global__ void device_calculate(double step, double xleft, double yupper, int *gpu_buf) {
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    int j = threadIdx.y + blockIdx.y * blockDim.y;
+
+    // Guard the grid tail: without it, surplus threads would write out of bounds.
+    if (i >= XSIZE || j >= YSIZE)
+        return;
+
+    // same as inner-most part of host_calculate
+    my_complex_t c, z, temp;
+    int iter = 0;
+    c.real = (xleft + step * i);
+    c.imag = (yupper - step * j);
+    z = c;
+    while (z.real * z.real + z.imag * z.imag < 4.0) {
+        temp.real = z.real * z.real - z.imag * z.imag + c.real;
+        temp.imag = 2.0 * z.real * z.imag + c.imag;
+        z = temp;
+        if (++iter == MAXITER)
+            break;
+    }
+    gpu_buf[PIXEL(i, j)] = iter;
+}
 
 /********** SUBTASK1 END *****************************************************/
 
@@ -130,14 +177,23 @@ int main(int argc, char **argv) {
 
     /********** SUBTASK2: Set up device memory *******************************/
 
-    // Insert code here
+    // Device-side result buffer: one int per pixel.
+    int *gpu_buf = NULL;
+    check_cuda(cudaMalloc(&gpu_buf, sizeof(int) * XSIZE * YSIZE), "cudaMalloc");
 
     /********** SUBTASK2 END *************************************************/
 
     start = walltime();
     /********** SUBTASK3: Execute the kernel on the device *******************/
 
-    // Insert code here
+    // Round the grid up so every pixel is covered even when the image size is
+    // not a multiple of the block size; the kernel ignores surplus threads.
+    dim3 grid((XSIZE + BLOCKX - 1) / BLOCKX, (YSIZE + BLOCKY - 1) / BLOCKY);
+    dim3 block(BLOCKX, BLOCKY);
+    device_calculate<<<grid, block>>>(step, xleft, yupper, gpu_buf);
+    check_cuda(cudaGetLastError(), "device_calculate launch");
+    // Launches are asynchronous: wait here so execution faults are reported now.
+    check_cuda(cudaDeviceSynchronize(), "device_calculate execution");
 
     /********** SUBTASK3 END *************************************************/
 
@@ -147,7 +203,8 @@ int main(int argc, char **argv) {
 
     /********** SUBTASK4: Transfer the result from device to device_pixel[][]*/
 
-    // Insert code here
+    check_cuda(cudaMemcpy(device_pixel, gpu_buf, sizeof(int) * XSIZE * YSIZE, cudaMemcpyDeviceToHost),
+               "cudaMemcpy device to host");
 
     /********** SUBTASK4 END *************************************************/
 
@@ -155,7 +212,7 @@ int main(int argc, char **argv) {
 
     /********** SUBTASK5: Free the device memory also ************************/
 
-    // Insert code here
+    check_cuda(cudaFree(gpu_buf), "cudaFree");
 
     /********** SUBTASK5 END *************************************************/
 