MS Classic Bounce Multithreading Example with 128 threads

Beege · October 12, 2019

Here is an old goodie from MS demonstrating the concepts behind multithreading and using mutexes to control sharing the screen. It's unfortunately just a console application, so you have to press compile (F7) to run it (which can get annoying if you want to play with the code), but it's still pretty cool :). Each little question-mark box (it could be any character; it used to be a smiley face in Win 7) is its own thread keeping track of its own coordinates. The threads share the screen mutex by, in effect, waiting in line for ownership of it. When a thread gains ownership it updates the screen, then releases the mutex for the next thread.

bounce.PNG.f2e4fbb3ea0be338d3b6f189e5e55e8c.PNG

First I wrote it in pure AutoIt to confirm everything was working as expected. The console functions actually threw me for a loop: they want the whole value of the COORD struct and not a pointer to it, so passing "struct" without a * was a little uncommon. The au3 code below is just a single lonely cell bouncing around.

; _BounceAU3()
;   Single-threaded reference version of the bounce cell: moves one coloured
;   character around the console buffer and reverses its direction whenever the
;   next position would leave the 120x30 area.
;   Reads the globals $g_hStdHandle (console output handle) and $tagCOORD.
;   Parameters: none.  Return value: none.
;   Runs until GUIGetMsg() returns -3 ($GUI_EVENT_CLOSE).
Func _BounceAU3()

    ;size of the area the cell bounces in and the pause between two moves (ms)
    Local Const $iCols = 120, $iRows = 30, $iDelay = 75

    ;set a random starting id. we use this to rotate the colors
    Local $iMyID = Random(1, 15, 1)
    Local $tMyCell = DllStructCreate('char mc'), $tOldCell = DllStructCreate('char oc')
    Local $tMyAttrib = DllStructCreate('word ma'), $tOldAttrib = DllStructCreate('word oa')
    Local $tCoords = DllStructCreate($tagCOORD), $tOld = DllStructCreate($tagCOORD)
    Local $tDelta = DllStructCreate($tagCOORD)

    ;Random start and delta values
    $tCoords.X = Random(0, $iCols - 1, 1)
    $tCoords.Y = Random(0, $iRows - 1, 1)
    $tDelta.X = Random(-3, 3, 1)
    $tDelta.Y = Random(-3, 3, 1)

    ;set character/cell attributes
    $tMyCell.mc = $iMyID > 16 ? 0x01 : 0x02 ; doesnt seem to make a differnce in windows 10
    $tMyAttrib.ma = BitAND($iMyID, 0x0F) ; Set the character color

    ;The character functions are called through their ANSI ("A") entry points on purpose:
    ;kernel32 exports only ...A / ...W variants of them, the buffers used here are one-byte
    ;'char' structs, and the threaded assembly version resolves the same ...A functions.
    ;COORD arguments are passed by value ("struct"), not by pointer.
    Do
        ;check the last position values
        DllCall('kernel32.dll', "bool", "ReadConsoleOutputCharacterA", "handle", $g_hStdHandle, "struct*", $tOldCell, "dword", 1, "struct", $tOld, "dword*", 0)
        DllCall('kernel32.dll', "bool", "ReadConsoleOutputAttribute", "handle", $g_hStdHandle, "struct*", $tOldAttrib, "dword", 1, "struct", $tOld, "dword*", 0)

        ;if the last postion was this cell, blank/empty the cell. (Otherwise its been taken over by another thread)
        If ($tOldCell.oc = $tMyCell.mc) And ($tOldAttrib.oa = $tMyAttrib.ma) Then
            DllCall('kernel32.dll', "bool", "WriteConsoleOutputCharacterA", "handle", $g_hStdHandle, "byte*", 0x20, "dword", 1, "struct", $tOld, "dword*", 0)
        EndIf

        ;write the current cell
        DllCall('kernel32.dll', "bool", "WriteConsoleOutputCharacterA", "handle", $g_hStdHandle, "struct*", $tMyCell, "dword", 1, "struct", $tCoords, "dword*", 0)
        DllCall('kernel32.dll', "bool", "WriteConsoleOutputAttribute", "handle", $g_hStdHandle, "struct*", $tMyAttrib, "dword", 1, "struct", $tCoords, "dword*", 0)

        ;remember where we drew, then advance
        $tOld.X = $tCoords.X
        $tOld.Y = $tCoords.Y
        $tCoords.X += $tDelta.X
        $tCoords.Y += $tDelta.Y

        ;change directions if we are out of bounds
        If $tCoords.X < 0 Or $tCoords.X >= $iCols Then $tDelta.X *= -1
        If $tCoords.Y < 0 Or $tCoords.Y >= $iRows Then $tDelta.Y *= -1

        Sleep($iDelay)

    Until GUIGetMsg() = -3
EndFunc   ;==>_BounceAU3

From there, that function was converted into assembly so we can run it as a thread. The only real differences are the extra parameters we are passing as a structure, and that I generate the random starting values in AutoIt instead and then pass them to the function. Here is what the main assembly function looks like. I added comments for each piece of code from the au3 version that we are translating:

; _Bounce - 32-bit thread procedure that animates one console cell.
;   pParms -> parameter block laid out as 'short start[4];word myid;dword hands[3];ptr funcs[6]':
;       +0  start X      +2  start Y       +4  delta X      +6  delta Y      +8  id
;       +12 screen mutex +16 run mutex     +20 console output handle
;       +24 WaitForSingleObject            +28 ReleaseMutex
;       +32 ReadConsoleOutputCharacterA    +36 ReadConsoleOutputAttribute
;       +40 WriteConsoleOutputCharacterA   +44 WriteConsoleOutputAttribute
;   All fields are copied into locals before the loop starts.
;   The loop ends as soon as waiting on the run mutex returns anything other than 258 (timeout).
_('procf _Bounce uses ebx, pParms')
    ;
    ;   create the local variables
    _(' locals')
    _('     BlankCell db 32') ; this first group covers the variables from the original script
    _('     MyCell db ?')
    _('     OldCell db ?')
    _('     MyAtt dw ?')
    _('     OldAtt dw ?')
    _('     tCoords COORD')
    _('     tDelta COORD')
    _('     tOld COORD')
    ;       the console functions store a DWORD count through this pointer, so it must be 4 bytes wide
    ;       (a 2 byte slot lets every call overwrite the local that follows it)
    _('     bytesread dd ?')
    ;
    _('     iMyID dw ?') ;  this group of local vars cover holding all the other paramerters we are passing in tParms
    _('     g_hScreenMutex dd ?')
    _('     g_hRunMutex dd ?')
    _('     g_hStdHandle dd ?')
    _('     pfWaitForSingleObject dd ?')
    _('     pfReleaseMutex dd ?')
    _('     pfReadChar dd ?')
    _('     pfReadAttr dd ?')
    _('     pfWriteChar dd ?')
    _('     pfWriteAttr dd ?')
    _(' endl')
    ;
    ;all of these push/pops are to transfer the rest of variables from tParms structure to the local variables we created
    ;first mov the structure address into ebx
    _(' mov ebx, [pParms]')
    ;
    ; now push and pop the values into the variables
    ; use _winapi_displaystruct() to view all the offsets being used in the [ebx+offset] lines
    _(' pushw [ebx]') ;             start(1) -> X
    _(' popw word[tCoords+COORD.X]')
    _(' pushw word[ebx+2]') ;       start(2) -> Y
    _(' popw word[tCoords+COORD.Y]')
    _(' pushw word[ebx+4]') ;       start(3) -> delta X
    _(' popw word[tDelta+COORD.X]')
    _(' pushw word[ebx+6]') ;       start(4) -> delta Y
    _(' popw word[tDelta+COORD.Y]')
    _(' pushw word[ebx+8]') ;       myid
    _(' popw word[iMyID]')
    _(' push dword[ebx+12]') ;      hands(1)
    _(' pop dword[g_hScreenMutex]')
    _(' push dword[ebx+16]') ;      hands(2)
    _(' pop dword[g_hRunMutex]')
    _(' push dword[ebx+20]') ;      hands(3)
    _(' pop dword[g_hStdHandle]')
    _(' push dword[ebx+24]') ;      funcs(1)
    _(' pop dword[pfWaitForSingleObject]')
    _(' push dword[ebx+28]') ;      funcs(2)
    _(' pop dword[pfReleaseMutex]')
    _(' push dword[ebx+32]') ;      funcs(3)
    _(' pop dword[pfReadChar]')
    _(' push dword[ebx+36]') ;      funcs(4)
    _(' pop dword[pfReadAttr]')
    _(' push dword[ebx+40]') ;      funcs(5)
    _(' pop dword[pfWriteChar]')
    _(' push dword[ebx+44]') ;      funcs(6)
    _(' pop dword[pfWriteAttr]')

    ; MyCell is a single byte: store a byte so the neighbouring OldCell is not overwritten
    _('.if word[iMyID] > 16') ; $tMyCell.mc = $iMyID > 16 ? 0x01 : 0x02 (no difference in windows 10)
    _('     mov byte[MyCell], 1')
    _('.else')
    _('     mov byte[MyCell], 2')
    _('.endif')
    ;
    _('pushw word[iMyID]') ;  $tMyAttrib.ma = BitAND($iMyID, 0x0F)
    _('popw word[MyAtt]')
    _('and word[MyAtt], 15')
    ;
    _('.repeat') ; do
    ;
    ;       Wait infinetly for the screen mutex to be available, then take ownership
    _('     invoke pfWaitForSingleObject, [g_hScreenMutex], -1')
    ;
    ;       read back what is currently displayed at our previous position (COORD is passed by value as one dword)
    ;       DllCall('kernel32.dll', "bool", "ReadConsoleOutputCharacter", "handle", $g_hStdHandle, "struct*", $tOldCell, "dword", 1, "struct", $tOld, "dword*", 0)
    ;       DllCall('kernel32.dll', "bool", "ReadConsoleOutputAttribute", "handle", $g_hStdHandle, "struct*", $tOldAttrib, "dword", 1, "struct", $tOld, "dword*", 0)
    _('     invoke pfReadChar, [g_hStdHandle], addr OldCell, 1, dword[tOld], addr bytesread') ;
    _('     invoke pfReadAttr, [g_hStdHandle], addr OldAtt, 1, dword[tOld], addr bytesread') ;
    ;
    ;       only blank the old position if it still shows our own cell (otherwise another thread has taken it over)
    _('     mov al, byte[MyCell]') ;If ($tOldCell.oc = $tMyCell.mc) And ($tOldAttrib.oa = $tMyAttrib.ma) Then
    _('     mov cl, byte[MyAtt]')
    _('     .if (byte[OldCell] = al) & (byte[OldAtt] = cl)')
    ;           DllCall('kernel32.dll', "bool", "WriteConsoleOutputCharacter", "handle", $hStdHandle, "byte*", 0x20, "dword", 1, "struct", $tOld, "dword*", 0)
    _('         invoke pfWriteChar, [g_hStdHandle], addr BlankCell, 1, dword[tOld], addr bytesread')
    _('     .endif')
    ;
    ;       DllCall('kernel32.dll', "bool", "WriteConsoleOutputCharacter", "handle", $hStdHandle, "struct*", $tMyCell, "dword", 1, "struct", $tCoords, "dword*", 0)
    _('     invoke pfWriteChar, [g_hStdHandle], addr MyCell, 1, dword[tCoords], addr bytesread')
    _('     invoke pfWriteAttr, [g_hStdHandle], addr MyAtt, 1, dword[tCoords], addr bytesread')
    ;
    _('     pushw word[tCoords+COORD.X]') ;$tOld.X = $tCoords.X
    _('     popw word[tOld+COORD.X]')
    ;
    _('     pushw word[tCoords+COORD.Y]') ;$tOld.Y = $tCoords.Y
    _('     popw word[tOld+COORD.Y]')

    _('     mov ax, word[tDelta+COORD.X]') ; $tCoords.X += $tDelta.X
    _('     add word[tCoords+COORD.X], ax')
    ;
    _('     mov ax, word[tDelta+COORD.Y]') ; $tCoords.Y += $tDelta.Y
    _('     add word[tCoords+COORD.Y], ax')
    ;
    ;       If $tCoords.X < 0 Or $tCoords.X >= 120 Then $tDelta.X *= -1
    ;       NOTE(review): if the .if macro compares unsigned, the "< 0" test never fires and a negative
    ;       coordinate is caught by the ">= 120" / ">= 30" test instead - confirm against the macro set in use
    _('     .if (word[tCoords+COORD.X] < 0 | word[tCoords+COORD.X] >= 120)')
    _('         neg word[tDelta+COORD.X]')
    _('     .endif')
    ;       If $tCoords.Y < 0 Or $tCoords.Y >= 30 Then $tDelta.Y *= -1
    _('     .if (word[tCoords+COORD.Y] < 0 | word[tCoords+COORD.Y] >= 30)')
    _('         neg word[tDelta+COORD.Y]')
    _('     .endif')
    ;
    ;       release the screen mutex
    _('     invoke pfReleaseMutex, [g_hScreenMutex]')
    ;
    ;       wait 100 ms for the Runmutex to be available.
    _('     invoke pfWaitForSingleObject, [g_hRunMutex], 100')
    ;
    ;   a return of 258 means it timed out waiting and that the run mutex (owned by the main autoit thread) is still alive.
    ;   when the run mutex handle gets closed this will return a fail or abandonded.
    _('.until eax <> 258') ;
    ;exit thread
    _(' ret')
    _('endp')

And finally, here is how we call that assembled function from AutoIt to create the threads:

;create the unnamed mutex used for sharing the screen; it is created NOT owned by the main thread (second argument False)
    Global $g_hScreenMutex = _WinAPI_CreateMutex('', False) ; the bounce threads take turns owning this one
    ;create the unnamed mutex that tells the threads to exit; it is created owned by the main thread (second argument True)
    Global $g_hRunMutex = _WinAPI_CreateMutex('', True) 
    ...
    ...
    ;assemble function
    Local $tBinExec = _fasmg_Assemble($g_sFasm, False) ;Local $tBinExec = _fasmg_CompileAu3($g_sFasm)
    If @error Then Exit (ConsoleWrite($tBinExec & @CRLF))

    ;this is struct is for all the values Im passing to the thread.
    ;this will hold are random start x,y,delta values, handles, and pointers to functions called within the thread
    $tParms = DllStructCreate('short start[4];word myid;dword hands[3];ptr funcs[6]')
    $tParms.start(1) = Random(0, 119, 1)
    $tParms.start(2) = Random(0, 29, 1)
    $tParms.start(3) = Random(-3, 3, 1)
    $tParms.start(4) = Random(-3, 3, 1)
    $tParms.myid = 1
    $tParms.hands(1) = $g_hScreenMutex
    $tParms.hands(2) = $g_hRunMutex
    $tParms.hands(3) = $g_hStdHandle
    $tParms.funcs(1) = _GPA('kernel32.dll', 'WaitForSingleObject')
    $tParms.funcs(2) = _GPA('kernel32.dll', 'ReleaseMutex')
    $tParms.funcs(3) = _GPA('kernel32.dll', 'ReadConsoleOutputCharacterA')
    $tParms.funcs(4) = _GPA('kernel32.dll', 'ReadConsoleOutputAttribute')
    $tParms.funcs(5) = _GPA('kernel32.dll', 'WriteConsoleOutputCharacterA')
    $tParms.funcs(6) = _GPA('kernel32.dll', 'WriteConsoleOutputAttribute')

    ;create 128 threads with different start values and colors for each one
    For $i = 1 To 128
        $tParms.myid = $i
        $tParms.start(1) = Random(0, 119, 1)
        $tParms.start(2) = Random(0, 29, 1)
        $tParms.start(3) = Random(-3, 3, 1)
        $tParms.start(4) = Random(-3, 3, 1)
        If $tParms.start(3) + $tParms.start(4) = 0 Then $tParms.start(3) = (Mod(@MSEC, 2) ? 1 : -1) ; adjusting non-moving (0,0) delta values..
        DllCall("kernel32.dll", "hwnd", "CreateThread", "ptr", 0, "dword", 0, "struct*", $tBinExec, "struct*", $tParms, "dword", 0, "dword*", 0)
        Sleep(50)
    Next

    MsgBox(262144, '', '128 Threads Created')

    ;Close the run mutex handle. This will cause all the threads to exit
    _WinAPI_CloseHandle($g_hRunMutex)
    _WinAPI_CloseHandle($g_hScreenMutex)

    MsgBox(262144, '', 'Mutex handles closed. All Threads should have exited')
    Exit

The attachment below contains both the compiled and source assembly. To play with the assembly source you need to add the fasmg udf in my sig. The compiled version should not need anything. Let me know if you have any issues.

Special thanks to @trancexx for teaching me this with her clock example

Bounce.zip

Edited October 12, 2019 by Beege

UEZ · October 12, 2019

That's kinda cool code :thumbsup: Thanks for sharing.

After a long time of assembler abstinence I currently try some ASM code to speed up my TGA loader. Currently I'm stuck with x64 ASM code...

Beege · October 12, 2019

Thanks UEZ!

I took a look at your loader and have been getting caught up on TGA format. I cant believe I never heard of it. I love the simple header. Replacing those For loops would definitely speed things up. The parts for me that I always get stuck on are dealing with floats. Have you got asm code working for 32bit yet?

UEZ · October 12, 2019

33 minutes ago, Beege said:

Thanks UEZ!

I took a look at your loader and have been getting caught up on TGA format. I cant believe I never heard of it. I love the simple header. Replacing those For loops would definitely speed things up. The parts for me that I always get stuck on are dealing with floats. Have you got asm code working for 32bit yet?

Well, for the TGA loader, loading a 32-bit image should be relatively fast as it is a 1D loop. Only the 2D loops take a very long time for larger images.

Currently I've done the x86 ASM code for 15/16-bit images, but for x64 the code doesn't work. E.g. loading a 15-bit image with dimensions 2789x3500 takes more than 134992 ms on my machine using native AutoIt, and 37 ms with ASM -> roughly 3,648 times faster!

This applies only to 8/15/16/24-bit images whose width is not divisible by 4.

Here the part of the UDF which can be replaced with the non-pro ASM code

Case 15, 16, 24, 32 ;15/16/24/32-bit, as the bitmap format is the same we can use memcpy to copy the pixel data directly to the memory.
                            ;Exeptions are 15/16/24-bit images whose width is not a divider of 4!
            If BitOR($iPxDepth = 15, $iPxDepth = 16, $iPxDepth = 24) And Mod($iW, 4) Then
                Switch $iPxDepth
                    Case 15, 16
                        ;pre-assembled 32-bit machine code for the 15/16-bit copy routine
                        ;(presumably built from the _ASM1516_x86 listing - the blob ends in C2 24 00, i.e. "ret 36")
                        Local Const $bBinASM1516_x86 = Binary("0x5589E58B5D188B4D1C89C8F7651089452489C8F765148945285389D8D1E08B552801C203550C8B752401C60375200375088B066689024B83FB0075DE5B4983F90075C65DC22400")
                        ;copy the code bytes into a byte struct so that their address can be called
                        Local $tBinASM1516_x86 = DllStructCreate("byte asm[" & BinaryLen($bBinASM1516_x86) & "]")
                        $tBinASM1516_x86.asm = $bBinASM1516_x86
                        ;two dword structs whose pointers are handed over as the 8th and 9th argument
                        Local $tMemVar1 = DllStructCreate("dword var"), $tMemVar2 = DllStructCreate("dword var")
                        ;call the routine with nine 4-byte arguments, in this order:
                        ;source ptr, destination ptr, $iW * 2, $stride, $iW - 1, $iH - 1, $pitch, ptr to $tMemVar1, ptr to $tMemVar2
                        DllCallAddress("none", DllStructGetPtr($tBinASM1516_x86), _
                                       "ptr", DllStructGetPtr($tSrcBmp), "ptr", DllStructGetPtr($tDestBmp), _
                                       "dword", $iW * 2, "dword", $stride, "dword", $iW - 1, "dword", $iH - 1, "dword", $pitch, _
                                       "ptr", DllStructGetPtr($tMemVar1), "ptr", DllStructGetPtr($tMemVar2))
                    Case 24

The ASM code:

#cs _ASM1516_x86
    ;-----------------------------------------------------------------------
    ; _ASM1516_x86 - copy 16-bit pixels from a source to a destination bitmap
    ; Target: x86 (32-bit), Intel syntax, stdcall - nine dword arguments on
    ;         the stack, removed by the callee (ret 36).
    ; In:     tSrcBmp / tDestBmp  base addresses of source / destination
    ;         strideS / strideD   bytes per row in source / destination
    ;         width  = w - 1      highest column index
    ;         height = h - 1      highest row index
    ;         pitch               extra byte offset added to every source address
    ;         tMemVar1/2          argument slots, reused as scratch for the
    ;                             current row offsets (the pointers passed in
    ;                             are overwritten, never dereferenced)
    ; Clobb:  eax, ecx, edx, flags (ebx and esi are saved and restored)
    ;
    ; Changes against the first version:
    ;   * both loops now run down to index 0 inclusive (jns instead of
    ;     "cmp reg, 0 / jne"), so column 0 and row 0 are copied as well and a
    ;     1-pixel wide or high image no longer wraps the counter
    ;   * ebx / esi are callee-saved under stdcall and are now preserved
    ;   * the pixel is loaded as a word, so nothing past the last source
    ;     pixel is read
    ; NOTE: any pre-assembled binary of this routine has to be regenerated
    ;       from this listing for the changes to take effect.
    ;-----------------------------------------------------------------------
    use32

    define tSrcBmp  dword[ebp + 08]
    define tDestBmp dword[ebp + 12]
    define strideS  dword[ebp + 16]
    define strideD  dword[ebp + 20]
    define width    dword[ebp + 24]
    define height   dword[ebp + 28]
    define pitch    dword[ebp + 32]
    define tMemVar1 dword[ebp + 36]
    define tMemVar2 dword[ebp + 40]

    push ebp
    mov ebp, esp
    push ebx                        ; saved after the frame is set up, so the
    push esi                        ; argument offsets above stay valid

    mov ebx, width                  ; ebx = w - 1 (current column, counts down)
    mov ecx, height                 ; ecx = h - 1 (current row, counts down)

;~  _ASMDBG_()

    _y:
        mov eax, ecx
        mul strideS                 ; edx:eax = row * source stride
        mov tMemVar1, eax           ; source row offset

        mov eax, ecx
        mul strideD                 ; edx:eax = row * destination stride
        mov tMemVar2, eax           ; destination row offset
        push ebx                    ; keep w - 1 for the next row
        _x:
            mov eax, ebx
            shl eax, 1              ; column * 2 bytes per pixel

            mov edx, tMemVar2
            add edx, eax
            add edx, tDestBmp       ; edx -> destination pixel

            mov esi, tMemVar1
            add esi, eax
            add esi, pitch
            add esi, tSrcBmp        ; esi -> source pixel

            movzx eax, word[esi]    ; read exactly one 16-bit pixel
            mov word[edx], ax

            dec ebx
            jns _x                  ; continue while column >= 0
        pop ebx
        dec ecx
        jns _y                      ; continue while row >= 0

    pop esi
    pop ebx
    pop ebp

    ret 36                          ; 9 arguments * 4 bytes
#ce _ASM1516_x86

Any idea how to convert it to a working x64 version?

Edited October 12, 2019 by UEZ

Sign In

MS Classic Bounce Multithreading Example with 128 threads

Recommended Posts

Beege

Link to comment

Share on other sites

UEZ

Link to comment

Share on other sites

Beege

Link to comment

Share on other sites

UEZ

Link to comment

Share on other sites

Create an account or sign in to comment

Create an account

Sign in

Recently Browsing 0 members

Similar Content

Ward's Inline FASM UDF

Real Time Assembler

MultiProcess V3 [NO FUNC LIMITS]

Autoit api thread * update

Fastest way to send multiple HTTP requests

Browse

AutoIt Resources

Release

Beta