#include "../../CUDA.au3"
#include <WinAPIEx.au3>
#include <GDIPlus.au3>

; ---------------------------------------------------------------------------
; Script parameters. Both globals are read by _PrepRange() and _Render().
; ---------------------------------------------------------------------------
Global $Range = [-2, 2, -2, 2] ; Visible region of the plane: [Xmin, Xmax, Ymin, Ymax]
Global $iWndSize = 768 ; Window (and bitmap) width and height, in pixels

; Create the window. A blank label covers the whole client area so that a click
; anywhere produces a $cidClick message in the main loop.
$hGUI = GUICreate("Julia Fractal - now with CUDA! [click to zoom in]", $iWndSize, $iWndSize)
$cidClick = GUICtrlCreateLabel("", 0, 0, $iWndSize, $iWndSize)
GUISetState()

; Initialize the CUDA Driver API. Each failure shows an error box (16 = stop icon)
; and terminates the script.
_CUDA_Startup()
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't load nvcuda.dll")
_CUDA_Init()
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't init CUDA.")

; Set up the device (ordinal 0) & create a context on it
$hDevice = _CUDA_DeviceGet(0)
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't load the device")
; NOTE(review): the result of context creation is not checked - confirm whether
; _CUDA_CtxCreate reports failure through @error.
$hContext = _CUDA_CtxCreate($hDevice)

; Load the compiled module and look up the "julia" kernel in it.
; The relative path means julia.cubin is resolved against the working directory.
$hModule = _CUDA_ModuleLoad("julia.cubin")
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't load julia.cubin")
$hFunction = _CUDA_ModuleGetFunction($hModule, "julia")

; Allocate GPU memory for the four range floats and upload the initial range
$pDev_Range = _CUDA_MemAlloc(DllStructGetSize(DllStructCreate("float[4]")))
_PrepRange()

; Set up kernel launch parameters.
; Parameter block layout used below (12 bytes in total):
;   offset 0, 4 bytes : device pointer to the range floats
;   offset 4, 4 bytes : iteration count (integer)
;   offset 8, 4 bytes : device pointer to the pixel buffer (set further down)
; NOTE(review): both pointers are passed with a size of 4 bytes, which presumably
; assumes a 32-bit process and a kernel built for 32-bit pointers - confirm.
_CUDA_FuncSetBlockShape($hFunction, 1, 1, 1)
_CUDA_FuncSetSharedSize($hFunction, 12)
_CUDA_ParamSetSize($hFunction, 12)
_CUDA_ParamSetv($hFunction, 0, $pDev_Range, 4) ; Ptr to range struct
_CUDA_ParamSeti($hFunction, 4, 300) ; Iteration count

; Set up GDI+: a graphics object that draws onto the window, and an off-screen
; bitmap of the same size that receives the computed pixels.
_GDIPlus_Startup()
$hGpx = _GDIPlus_GraphicsCreateFromHWND($hGUI)

$hImg = _GDIPlus_BitmapCreateFromScan0($iWndSize, $iWndSize)

; GPU pixel buffer: 4 bytes per pixel, same dimensions as the bitmap
$pDev_Scan0 = _CUDA_MemAlloc(4*$iWndSize*$iWndSize)
_CUDA_ParamSetv($hFunction, 8, $pDev_Scan0, 4) ; Ptr to Scan0 data

; Draw the initial view
_Render()

; Main message loop: exit when the window is closed, zoom in 2x around the
; clicked point when the full-window label is clicked.
While 1
	Switch GUIGetMsg()
		Case -3 ; $GUI_EVENT_CLOSE
			ExitLoop
		Case $cidClick
			; GUIGetCursorInfo() returns a non-array and sets @error on failure;
			; indexing that result would abort the script, so skip this click.
			$cInfo = GUIGetCursorInfo()
			If @error Or Not IsArray($cInfo) Then ContinueLoop

			; The new half-extent is a quarter of the current X width, i.e. the
			; visible region shrinks to half its size on each click.
			$fWidth = $Range[1] - $Range[0]
			$fSize = $fWidth / 4

			; Map the click position (client pixels) to plane coordinates
			$fCenterX = ($cInfo[0] / $iWndSize) * $fWidth + $Range[0]
			$fCenterY = ($cInfo[1] / $iWndSize) * ($Range[3] - $Range[2]) + $Range[2]

			; Re-center the view on the clicked point
			$Range[0] = $fCenterX - $fSize
			$Range[1] = $fCenterX + $fSize
			$Range[2] = $fCenterY - $fSize
			$Range[3] = $fCenterY + $fSize

			; Upload the new range to the GPU and redraw
			_PrepRange()
			_Render()
	EndSwitch
WEnd

; Release GDI+ resources: the off-screen bitmap first, then the window graphics
; object, then the GDI+ library itself.
_GDIPlus_ImageDispose($hImg)

_GDIPlus_GraphicsDispose($hGpx)
_GDIPlus_Shutdown()

; Clean up: free both GPU buffers before the context is released
_CUDA_MemFree($pDev_Range)
_CUDA_MemFree($pDev_Scan0)

; NOTE(review): the module loaded from julia.cubin is not explicitly unloaded;
; presumably it is released together with the context - confirm.
_CUDA_CtxDetach($hContext)

_CUDA_Shutdown()

; Uploads the current view rectangle to the GPU.
; Reads the global $Range ([Xmin, Xmax, Ymin, Ymax]) and writes it, as four
; consecutive floats, into the device buffer $pDev_Range.
; Takes no parameters and returns nothing meaningful.
Func _PrepRange()
	; Declared Local so the staging struct can never bind to (and overwrite)
	; a same-named global defined in this script or one of its includes.
	Local $tRange = DllStructCreate("float data[4]")
	For $i = 0 To 3
		; DllStruct array elements are 1-based, the AutoIt array is 0-based
		DllStructSetData($tRange, "data", $Range[$i], $i + 1)
	Next

	; Copy data from RAM to GPU
	_CUDA_MemcpyHtoD(DllStructGetPtr($tRange), $pDev_Range, DllStructGetSize($tRange))
EndFunc

; Runs the kernel over the whole bitmap and draws the result into the window.
; Uses the globals $hImg, $hGpx, $hFunction, $pDev_Scan0 and $iWndSize.
; Takes no parameters. On failure to lock the bitmap it sets @error to 1 and
; returns without touching the GPU or the window.
Func _Render()
	; Size of the pixel buffer: 4 bytes per pixel (32-bit ARGB)
	Local $iBytes = 4 * $iWndSize * $iWndSize

	; Get Scan0 (pointer to the bitmap's pixel data) by locking the full bitmap
	Local $tBData = _GDIPlus_BitmapLockBits($hImg, 0, 0, $iWndSize, $iWndSize, BitOR($GDIP_ILMWRITE, $GDIP_ILMREAD), $GDIP_PXF32ARGB)
	; Without a valid lock there is no pixel pointer to copy to or from
	If @error Then Return SetError(1, 0, 0)
	Local $pScan0 = DllStructGetData($tBData, "Scan0")

	; Copy to GPU, run the module and copy back to RAM.
	; The launch grid is $iWndSize x $iWndSize.
	_CUDA_MemcpyHtoD($pScan0, $pDev_Scan0, $iBytes)
	_CUDA_Launch($hFunction, $iWndSize, $iWndSize)
	_CUDA_MemcpyDtoH($pDev_Scan0, $pScan0, $iBytes)

	; Unlock bitmap data & render
	_GDIPlus_BitmapUnlockBits($hImg, $tBData)
	_GDIPlus_GraphicsDrawImageRect($hGpx, $hImg, 0, 0, $iWndSize, $iWndSize)
EndFunc
