#include "../../CUDA.au3"
#include <WinAPIEx.au3>
#include <GDIPlus.au3>

; CUDA image filtering demo.
; Loads "lena.png" through GDI+, copies its 32-bit ARGB pixel buffer to the GPU,
; launches the function named "kernel" from "invert.cubin" on a $iW x $iH grid,
; copies the buffer back and draws the result into the window.
; Every step that can fail is followed by an @error check that shows a message
; box and exits, matching the error-handling style used throughout the script.

$hGUI = GUICreate("CUDA image filtering", 512, 512)
GUISetState()

; Initialize the CUDA Driver API
_CUDA_Startup()
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't load nvcuda.dll")
_CUDA_Init()
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't init CUDA.")

; Set up the device & create a context
$hDevice = _CUDA_DeviceGet(0)
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't load the device")
$hContext = _CUDA_CtxCreate($hDevice)
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't create the context")

; Load the module and function
$hModule = _CUDA_ModuleLoad("invert.cubin")
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't load invert.cubin")
$hFunction = _CUDA_ModuleGetFunction($hModule, "kernel")
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't find the kernel function in invert.cubin")

; Set up parameters: one thread per block, 4 bytes of shared memory and a
; 4-byte parameter area (the single parameter is the device pointer set below).
_CUDA_FuncSetBlockShape($hFunction, 1, 1, 1)
_CUDA_FuncSetSharedSize($hFunction, 4)
_CUDA_ParamSetSize($hFunction, 4)

; GDI+ stuff
_GDIPlus_Startup()
$hGpx = _GDIPlus_GraphicsCreateFromHWND($hGUI)

$hImg = _GDIPlus_ImageLoadFromFile("lena.png")
; Without this check a missing/unreadable file would go on to lock, upload and
; launch on an invalid image.
If @error Then Exit MsgBox(16, "GDI+ Error", "Couldn't load lena.png")

$iW = _GDIPlus_ImageGetWidth($hImg)
$iH = _GDIPlus_ImageGetHeight($hImg)

; Size of the pixel buffer in bytes: 4 bytes per pixel (32-bit ARGB)
$iBytes = 4 * $iW * $iH

; Lock bitmap data and get the Scan0 ptr
$tBData = _GDIPlus_BitmapLockBits($hImg, 0, 0, $iW, $iH, BitOR($GDIP_ILMWRITE, $GDIP_ILMREAD), $GDIP_PXF32ARGB)
If @error Then Exit MsgBox(16, "GDI+ Error", "Couldn't lock the bitmap data")
$pScan0 = DllStructGetData($tBData, "Scan0")

; Allocate memory on the GPU
$pDev_Scan0 = _CUDA_MemAlloc($iBytes)
If @error Then Exit MsgBox(16, "CUDA API Error", "Couldn't allocate device memory")
_CUDA_ParamSetv($hFunction, 0, $pDev_Scan0, 4) ; Ptr to Scan0 data

; Copy to GPU, run the module and copy back to RAM
; NOTE(review): the arguments are passed as (source, destination, size); confirm
; this matches the parameter order declared by the wrappers in CUDA.au3.
_CUDA_MemcpyHtoD($pScan0, $pDev_Scan0, $iBytes)
_CUDA_Launch($hFunction, $iW, $iH)
_CUDA_MemcpyDtoH($pDev_Scan0, $pScan0, $iBytes)

; Unlock data and render
_GDIPlus_BitmapUnlockBits($hImg, $tBData)
_GDIPlus_GraphicsDrawImageRect($hGpx, $hImg, 0, 0, $iW, $iH)

; Idle until the window is closed (-3 is the value of $GUI_EVENT_CLOSE)
Do
Until GUIGetMsg() = -3

; Release GDI+ resources
_GDIPlus_ImageDispose($hImg)

_GDIPlus_GraphicsDispose($hGpx)
_GDIPlus_Shutdown()

; Release CUDA resources
_CUDA_MemFree($pDev_Scan0)

_CUDA_CtxDetach($hContext)

_CUDA_Shutdown()

