ABV Posted July 6, 2011 Share Posted July 6, 2011 For fun, here is a simple example of adaptive heuristic critic reinforcement learning. This is an early form of machine intelligence (AI) that has its roots in psychology. This behavioural response has been adapted to the machine world, where positive or negative reinforcement of an action is a point-scoring system. If the action was positive the score increases, and if the action was negative the score decreases. In turn, this affects the likelihood of that action being repeated. The objective is for an agent to explore the grid world and, by doing so, develop a cognitive map. This map will then statistically bias the agent to select actions that move it closer to the goal. For about the first 50 or so trials the agent will wander the grid world aimlessly, but you may notice that it slowly gets better at finding the goal. This script is based on work by: Anthony G. Pipe, Reinforcement learning and knowledge transformation in mobile robotics; Richard S. 
Sutton, Reinforcement Learning Architectures expandcollapse popup
#include <GUIConstantsEx.au3>
#include <Array.au3>
#include <WindowsConstants.au3>
#include <staticconstants.au3>

; Adaptive heuristic critic - reinforcement learning demo.
;
; An agent ("A") walks a square grid world until it reaches the goal ("G").
; $NumberGrid holds one learned value per square. Each step the agent picks
; one of its four neighbours by roulette-wheel selection weighted by those
; values, and TDLearning() pays credit back along the last three squares
; visited.

Global $iGridSize = 20
; Goal row/column. Int() keeps the index integral when $iGridSize is odd;
; a fractional value could never equal an integer position, so the trial
; loop would never terminate.
Global $iGoal = Int($iGridSize / 2)

Global $NumberGrid[$iGridSize][$iGridSize]     ; learned value of every square
Global $VisualGrid[$iGridSize][$iGridSize]     ; control id of every cell
Global $VisualGridData[$iGridSize][$iGridSize] ; text currently shown in every cell
Global $VOffset = 0
Global $HOffset = 0

; Current agent position and the three positions before it (-1 = not set yet).
Global $x, $y
Global $x_1 = -1, $y_1 = -1
Global $x_2 = -1, $y_2 = -1
Global $x_3 = -1, $y_3 = -1

Global $Trials = 0
Global $Attempts = 0
Global $rLamdba = 0.22 ; trace decay used by TDLearning
Global $rAlpha = 0.1   ; learning rate used by TDLearning

Global $GUI = GUICreate("Adaptive heuristic critic - Reinforcement learning", 525, 600)

; Create the grid. Note the layout: the first index ($y) advances the
; horizontal offset and the second index ($x) advances the vertical offset.
For $y = 0 To ($iGridSize - 1)
    For $x = 0 To ($iGridSize - 1)
        $VisualGrid[$y][$x] = GUICtrlCreateInput("", 25 + $HOffset, 25 + $VOffset, 25, 25, BitOR($SS_CENTER, $SS_CENTERIMAGE))
        $VOffset += 24
    Next
    $HOffset += 24
    $VOffset = 0
Next

; Create labels and inputs
GUICtrlSetFont(-1, 12, 800, 1, "Times New Roman")
GUICtrlCreateLabel("Number of Attempts", 300, 527, 175, 20)
Global $GUIAttempts = GUICtrlCreateInput("", 400, 525, 75, 20, BitOR($SS_CENTER, $SS_CENTERIMAGE))
GUICtrlCreateLabel("Number of Trials", 300, 552, 175, 20)
Global $GUITrials = GUICtrlCreateInput("", 400, 550, 75, 20, BitOR($SS_CENTER, $SS_CENTERIMAGE))
GUICtrlCreateLabel("G = Goal, A = Agent “Reinforcement is when an event following an action by an entity affects the entity’s tendency to perform the action again.” Skinner", 20, 525, 250, 500)

; Set the font of every cell and initialise it to empty
For $y = 0 To ($iGridSize - 1)
    For $x = 0 To ($iGridSize - 1)
        GUICtrlSetFont($VisualGrid[$y][$x], 12, 800, 1, "Times New Roman")
        GUICtrlSetData($VisualGrid[$y][$x], $VisualGridData[$y][$x])
    Next
Next

; Initialise the value grid to random data
For $y = 0 To ($iGridSize - 1)
    For $x = 0 To ($iGridSize - 1)
        $NumberGrid[$y][$x] = Random(0.1, 1)
    Next
Next

GUISetState()

; Run trials until the window is closed (_PausePolling exits the script).
While 1
    _PausePolling(50)
    RunTrial()
WEnd

; Waits roughly $iMs milliseconds while still servicing the GUI message
; queue, and exits the script as soon as the window is closed. Used instead
; of Sleep() so the close button also works in the middle of a long trial.
Func _PausePolling($iMs)
    Local $hTimer = TimerInit()
    Do
        If GUIGetMsg() = $GUI_EVENT_CLOSE Then Exit
    Until TimerDiff($hTimer) >= $iMs
EndFunc

; Runs one trial: places the goal, drops the agent on a random square and
; steps it until it reaches the goal, updating the counters in the GUI.
Func RunTrial()
    ; Set goal and reward position
    $VisualGridData[$iGoal][$iGoal] = "G"
    GUICtrlSetData($VisualGrid[$iGoal][$iGoal], $VisualGridData[$iGoal][$iGoal])
    $NumberGrid[$iGoal][$iGoal] = 500

    ; Forget the path of the previous trial. The agent is teleported to a new
    ; start square, so paying credit back to squares from the old path would
    ; corrupt their values.
    $x_1 = -1
    $y_1 = -1
    $x_2 = -1
    $y_2 = -1
    $x_3 = -1
    $y_3 = -1

    ; Set random agent start point
    $x = Random(0, $iGridSize - 1, 1)
    $y = Random(0, $iGridSize - 1, 1)
    $VisualGridData[$y][$x] = "A"
    GUICtrlSetData($VisualGrid[$y][$x], $VisualGridData[$y][$x])

    $Attempts = 0
    While ($x <> $iGoal) Or ($y <> $iGoal)
        _PausePolling(60)

        ; Shift the position history back by one step
        $x_3 = $x_2
        $y_3 = $y_2
        $x_2 = $x_1
        $y_2 = $y_1
        $y_1 = $y
        $x_1 = $x

        ; Delete the agent's old position
        $VisualGridData[$y_1][$x_1] = ""
        GUICtrlSetData($VisualGrid[$y_1][$x_1], $VisualGridData[$y_1][$x_1])

        ; Determine the next position using a roulette wheel selection
        NewPosition($x, $y)

        ; Pay back credit using the temporal difference algorithm
        TDLearning($x, $y, $x_1, $y_1, $x_2, $y_2)

        ; Draw the agent's new position
        $VisualGridData[$y][$x] = "A"
        GUICtrlSetData($VisualGrid[$y][$x], $VisualGridData[$y][$x])

        $Attempts += 1
        GUICtrlSetData($GUIAttempts, $Attempts)
    WEnd

    ; The agent is now on the goal square; clear it (the next trial redraws "G")
    $VisualGridData[$y][$x] = ""
    GUICtrlSetData($VisualGrid[$y][$x], $VisualGridData[$y][$x])

    $Trials += 1
    GUICtrlSetData($GUITrials, $Trials)
EndFunc

; Moves the agent one step.
;   $x, $y - [in/out] agent position; updated to the chosen neighbour
Func NewPosition(ByRef $x, ByRef $y)
    Local $rRX = Select4Value($x, $y)
    Roulette($rRX, $x, $y)
EndFunc

; Pays credit back along the last three squares visited.
;   $x, $y     - square the agent has just moved to
;   $x_1, $y_1 - square it came from
;   $x_2, $y_2 - square before that
; The square three steps back is read from the globals $x_3/$y_3.
; Each update moves the square's value by $rAlpha times the sum of the value
; differences along the path ahead of it, each further difference being
; weighted by one more factor of $rLamdba.
Func TDLearning($x, $y, $x_1, $y_1, $x_2, $y_2)
    Local $rLambdaSq = $rLamdba * $rLamdba

    If $x_3 <> -1 Then
        $NumberGrid[$y_3][$x_3] += $rAlpha * ( _
                ($NumberGrid[$y_2][$x_2] - $NumberGrid[$y_3][$x_3]) _
                + $rLamdba * ($NumberGrid[$y_1][$x_1] - $NumberGrid[$y_2][$x_2]) _
                + $rLambdaSq * ($NumberGrid[$y][$x] - $NumberGrid[$y_1][$x_1]))
    EndIf

    If $x_2 <> -1 Then
        ; Fixed: the learning rate now scales the whole sum and the nearest
        ; difference is unweighted, matching the $x_3 update above. Before,
        ; $rAlpha was applied to the second term only and the first term was
        ; multiplied by lambda.
        $NumberGrid[$y_2][$x_2] += $rAlpha * ( _
                ($NumberGrid[$y_1][$x_1] - $NumberGrid[$y_2][$x_2]) _
                + $rLamdba * ($NumberGrid[$y][$x] - $NumberGrid[$y_1][$x_1]))
    EndIf

    If $x_1 <> -1 Then
        $NumberGrid[$y_1][$x_1] += $rAlpha * ($NumberGrid[$y][$x] - $NumberGrid[$y_1][$x_1])
    EndIf
EndFunc

; Roulette-wheel selection of the next square.
;   $rRX   - values of the four neighbours in the order x+1, x-1, y+1, y-1
;            (0 for a neighbour outside the grid), as returned by Select4Value
;   $x, $y - [in/out] agent position; moved one square in the chosen
;            direction, or left unchanged if that would leave the grid
; A random number is drawn between 0 and the sum of the four values; each
; direction owns a slice of that range as wide as its value.
Func Roulette($rRX, ByRef $x, ByRef $y)
    ; Cumulative upper bounds of the four slices
    Local $rEndXPlus = $rRX[0]
    Local $rEndXMinus = $rEndXPlus + $rRX[1]
    Local $rEndYPlus = $rEndXMinus + $rRX[2]
    Local $rEndYMinus = $rEndYPlus + $rRX[3]

    Local $RouletteSpin = Random(0, $rEndYMinus)

    If $RouletteSpin >= 0 And $RouletteSpin <= $rEndXPlus Then
        If $x + 1 <= ($iGridSize - 1) Then $x += 1
    ElseIf $RouletteSpin >= $rEndXPlus And $RouletteSpin <= $rEndXMinus Then
        If $x - 1 >= 0 Then $x -= 1
    ElseIf $RouletteSpin >= $rEndXMinus And $RouletteSpin <= $rEndYPlus Then
        If $y + 1 <= ($iGridSize - 1) Then $y += 1
    ElseIf $RouletteSpin >= $rEndYPlus And $RouletteSpin <= $rEndYMinus Then
        If $y - 1 >= 0 Then $y -= 1
    EndIf
EndFunc

; Returns the values of the four squares around ($x, $y) as a 4-element
; array ordered x+1, x-1, y+1, y-1; a neighbour outside the grid scores 0.
; Fixed: the parameters are still ByRef (callers are unchanged) but are no
; longer modified. The previous version stepped $x/$y while probing, so
; later probes read the wrong squares and the agent could be moved by the
; lookup itself.
Func Select4Value(ByRef $x, ByRef $y)
    Local $rRX[4] = [0, 0, 0, 0]

    If $x + 1 <= ($iGridSize - 1) Then $rRX[0] = $NumberGrid[$y][$x + 1]
    If $x - 1 >= 0 Then $rRX[1] = $NumberGrid[$y][$x - 1]
    If $y + 1 <= ($iGridSize - 1) Then $rRX[2] = $NumberGrid[$y + 1][$x]
    If $y - 1 >= 0 Then $rRX[3] = $NumberGrid[$y - 1][$x]

    ;_ArrayDisplay($rRX) ;debug
    Return $rRX
EndFunc
Link to comment Share on other sites More sharing options...
Shaggi Posted July 6, 2011 Share Posted July 6, 2011 Lol, that is nice. After 70 or so it definitely gets better, with all goals under 100, and most under 60. I'm studying the logic atm, but I'm not quite sure I understand how it works. It keeps a table of successes and can evaluate, like, "if I'm at that position, I previously succeeded quicker by going left than by going right"? Ever wanted to call functions in another process? ProcessCall UDF. Console stuff: Console UDF. C Preprocessor for AutoIt OMG Link to comment Share on other sites More sharing options...
ABV Posted July 6, 2011 Author Share Posted July 6, 2011 Lol. that is nice. After 70~ or so it definately gets better, with all goals under 100, and most under 60. I'm studying the logic atm, but not quite sure i understand how it works.. It keeps a table over successes, and can valuate, like, "if i'm at that position, i previously suceeded quicker with going left than going right"? The agent uses a probabilistic approach to determine where to move next; this is strongly exploratory, but not very deterministic. Another approach would be to move to the square with the highest value. The agent looks at the up, down, left and right squares, then uses a roulette wheel to select the next square to move to. The roulette wheel works like this. If the values of the 4 squares are: Up = 100, Down = 10, Left = 5, Right = 1, a random value is created from 0 to the sum (0 - 116). If the random value is 0-1 the right square is chosen, if the value is 1-6 then left is chosen, if it is 6-16 then down is chosen, and if it is 16-116 then up is chosen. As you can see, the chance of the random value falling between 16 and 116 is much higher than of it falling between 0 and 1, but every square has a chance of being selected; squares with a high differential value have a much better chance. The program implements TD(λ) credit assignment: credit is passed back in a chain as the agent moves around the grid. This credit payback builds the cognitive map that is used by the action policy to determine the next state. If you see a deviation between the explanation and the code, it is most likely a bug. Link to comment Share on other sites More sharing options...
ABV Posted July 6, 2011 Author Share Posted July 6, 2011 (edited) Hi I have fixed a bug, and added the best4 policy as a choice; my GUI skills are not the best! You can now select the roulette or best4. Best4, simply moves to the location with the largest value. This is deterministic but not as explorative expandcollapse popup
#include <GUIConstantsEx.au3>
#include <Array.au3>
#include <WindowsConstants.au3>
#include <staticconstants.au3>

; Adaptive heuristic critic - reinforcement learning demo (roulette / best-4).
;
; An agent ("A") walks a square grid world until it reaches the goal ("G").
; $NumberGrid holds one learned value per square. Each step the agent picks
; one of its four neighbours, either by roulette-wheel selection weighted by
; those values or by taking the highest value (Best4), and TDLearning() pays
; credit back along the last three squares visited.

Opt("GUIOnEventMode", 1)

Global $iGridSize = 20
; Goal row/column. Int() keeps the index integral when $iGridSize is odd;
; a fractional value could never equal an integer position, so the trial
; loop would never terminate.
Global $iGoal = Int($iGridSize / 2)

Global $NumberGrid[$iGridSize][$iGridSize]     ; learned value of every square
Global $VisualGrid[$iGridSize][$iGridSize]     ; control id of every cell
Global $VisualGridData[$iGridSize][$iGridSize] ; text currently shown in every cell
Global $VOffset = 0
Global $HOffset = 0

; Current agent position and the three positions before it (-1 = not set yet).
Global $x, $y
Global $x_1 = -1, $y_1 = -1
Global $x_2 = -1, $y_2 = -1
Global $x_3 = -1, $y_3 = -1

Global $Trials = 0
Global $Attempts = 0
Global $rLamdba = 0.22 ; trace decay used by TDLearning
Global $rAlpha = 0.1   ; learning rate used by TDLearning
Global $bradio         ; checkbox state read when "Go" is pressed
Global $bRunning = False ; set by OKPressed; the main loop runs trials while True

Global $GUI = GUICreate("Adaptive heuristic critic - Reinforcement learning", 525, 620)
GUISetOnEvent($GUI_EVENT_CLOSE, "SpecialEvents")

; Create the grid. Note the layout: the first index ($y) advances the
; horizontal offset and the second index ($x) advances the vertical offset.
For $y = 0 To ($iGridSize - 1)
    For $x = 0 To ($iGridSize - 1)
        $VisualGrid[$y][$x] = GUICtrlCreateInput("", 25 + $HOffset, 25 + $VOffset, 25, 25, BitOR($SS_CENTER, $SS_CENTERIMAGE))
        $VOffset += 24
    Next
    $HOffset += 24
    $VOffset = 0
Next

; Create labels and inputs
GUICtrlSetFont(-1, 12, 800, 1, "Times New Roman")
GUICtrlCreateLabel("Number of Attempts", 300, 527, 175, 20)
Global $GUIAttempts = GUICtrlCreateInput("", 400, 525, 75, 20, BitOR($SS_CENTER, $SS_CENTERIMAGE))
GUICtrlCreateLabel("Number of Trials", 300, 552, 175, 20)
Global $GUITrials = GUICtrlCreateInput("", 400, 550, 75, 20, BitOR($SS_CENTER, $SS_CENTERIMAGE))
GUICtrlCreateLabel("G = Goal, A = Agent “Reinforcement is when an event following an action by an entity affects the entity’s tendency to perform the action again.” Skinner", 20, 525, 250, 500)

; Create the "Go" button and the policy checkbox
Global $OK_Btn = GUICtrlCreateButton("Go", 400, 595, 75, 20)
GUICtrlSetOnEvent($OK_Btn, "OKPressed")
Global $radio = GUICtrlCreateCheckbox("Policy Roulette (T) / Highest (F)", 300, 575, 180, 20)

; Set the font of every cell and initialise it to empty
For $y = 0 To ($iGridSize - 1)
    For $x = 0 To ($iGridSize - 1)
        GUICtrlSetFont($VisualGrid[$y][$x], 12, 800, 1, "Times New Roman")
        GUICtrlSetData($VisualGrid[$y][$x], $VisualGridData[$y][$x])
    Next
Next

; Initialise the value grid
For $y = 0 To ($iGridSize - 1)
    For $x = 0 To ($iGridSize - 1)
        $NumberGrid[$y][$x] = 0.1 ;random(0.1,1)
    Next
Next

GUISetState()

; Main loop. The trials run here rather than inside the "Go" handler so
; that the close event can still be delivered while a trial is in progress.
While 1
    If $bRunning Then
        Sleep(50)
        RunTrial()
    Else
        Sleep(10)
    EndIf
WEnd

; Close-event handler: destroys the GUI and ends the script.
Func SpecialEvents()
    ; Destroy the GUI including the controls
    GUIDelete()
    ; Exit the script
    Exit
EndFunc

; "Go" button handler: reads the policy checkbox and starts the trials.
; Fixed: this used to run the whole (endless) trial loop inside the handler,
; gated on GUIGetMsg(), which is not how events are detected in OnEvent mode;
; the handler never returned, so the close event could not be processed. It
; now only records the request and returns. Pressing "Go" again re-reads the
; checkbox.
Func OKPressed()
    $bradio = GUICtrlRead($radio)
    $bRunning = True
EndFunc

; Runs one trial: places the goal, drops the agent on a random square and
; steps it until it reaches the goal, updating the counters in the GUI.
Func RunTrial()
    ; Set goal and reward position
    $VisualGridData[$iGoal][$iGoal] = "G"
    GUICtrlSetData($VisualGrid[$iGoal][$iGoal], $VisualGridData[$iGoal][$iGoal])
    $NumberGrid[$iGoal][$iGoal] = 500

    ; Forget the path of the previous trial. The agent is teleported to a new
    ; start square, so paying credit back to squares from the old path would
    ; corrupt their values.
    $x_1 = -1
    $y_1 = -1
    $x_2 = -1
    $y_2 = -1
    $x_3 = -1
    $y_3 = -1

    ; Set random agent start point
    $x = Random(0, $iGridSize - 1, 1)
    $y = Random(0, $iGridSize - 1, 1)
    $VisualGridData[$y][$x] = "A"
    GUICtrlSetData($VisualGrid[$y][$x], $VisualGridData[$y][$x])

    $Attempts = 0
    While ($x <> $iGoal) Or ($y <> $iGoal)
        Sleep(60)

        ; Shift the position history back by one step
        $x_3 = $x_2
        $y_3 = $y_2
        $x_2 = $x_1
        $y_2 = $y_1
        $y_1 = $y
        $x_1 = $x

        ; Delete the agent's old position
        $VisualGridData[$y_1][$x_1] = ""
        GUICtrlSetData($VisualGrid[$y_1][$x_1], $VisualGridData[$y_1][$x_1])

        ; Determine the next position using a roulette wheel or highest of 4
        NewPosition($x, $y, $x_1, $y_1, $bradio)

        ; Pay back credit using the temporal difference algorithm
        TDLearning($x, $y, $x_1, $y_1, $x_2, $y_2)

        ; Draw the agent's new position
        $VisualGridData[$y][$x] = "A"
        GUICtrlSetData($VisualGrid[$y][$x], $VisualGridData[$y][$x])

        $Attempts += 1
        GUICtrlSetData($GUIAttempts, $Attempts)
    WEnd

    ; The agent is now on the goal square; clear it (the next trial redraws "G")
    $VisualGridData[$y][$x] = ""
    GUICtrlSetData($VisualGrid[$y][$x], $VisualGridData[$y][$x])

    $Trials += 1
    GUICtrlSetData($GUITrials, $Trials)
EndFunc

; Moves the agent one step using the selected policy.
;   $x, $y     - [in/out] agent position; updated to the chosen neighbour
;   $x_1, $y_1 - previous position, forwarded to Best4
;   $bradio    - checkbox state from GUICtrlRead; 4 selects Best4, anything
;                else selects Roulette
Func NewPosition(ByRef $x, ByRef $y, $x_1, $y_1, $bradio)
    Local $rRX = Select4Value($x, $y)
    If $bradio = 4 Then
        Best4($rRX, $x, $y, $x_1, $y_1)
    Else
        Roulette($rRX, $x, $y)
    EndIf
EndFunc

; Pays credit back along the last three squares visited.
;   $x, $y     - square the agent has just moved to
;   $x_1, $y_1 - square it came from
;   $x_2, $y_2 - square before that
; The square three steps back is read from the globals $x_3/$y_3.
; Each update moves the square's value by $rAlpha times the sum of the value
; differences along the path ahead of it, each further difference being
; weighted by one more factor of $rLamdba.
Func TDLearning($x, $y, $x_1, $y_1, $x_2, $y_2)
    Local $rLambdaSq = $rLamdba * $rLamdba

    If $x_3 <> -1 Then
        $NumberGrid[$y_3][$x_3] += $rAlpha * ( _
                ($NumberGrid[$y_2][$x_2] - $NumberGrid[$y_3][$x_3]) _
                + $rLamdba * ($NumberGrid[$y_1][$x_1] - $NumberGrid[$y_2][$x_2]) _
                + $rLambdaSq * ($NumberGrid[$y][$x] - $NumberGrid[$y_1][$x_1]))
    EndIf

    If $x_2 <> -1 Then
        ; Fixed: the learning rate now scales the whole sum and the nearest
        ; difference is unweighted, matching the $x_3 update above. Before,
        ; $rAlpha was applied to the second term only and the first term was
        ; multiplied by lambda.
        $NumberGrid[$y_2][$x_2] += $rAlpha * ( _
                ($NumberGrid[$y_1][$x_1] - $NumberGrid[$y_2][$x_2]) _
                + $rLamdba * ($NumberGrid[$y][$x] - $NumberGrid[$y_1][$x_1]))
    EndIf

    If $x_1 <> -1 Then
        $NumberGrid[$y_1][$x_1] += $rAlpha * ($NumberGrid[$y][$x] - $NumberGrid[$y_1][$x_1])
    EndIf
EndFunc

; Deterministic policy: move to the neighbour with the highest value.
;   $rRX       - values of the four neighbours in the order x+1, x-1, y+1, y-1
;   $x, $y     - [in/out] agent position
;   $x_1, $y_1 - previous position
Func Best4($rRX, ByRef $x, ByRef $y, $x_1, $y_1)
    ; Second argument 1 = compare numerically
    Local $Best4MaxIndex = _ArrayMaxIndex($rRX, 1)

    ; "Bored": take 5% off the value of the current square.
    ; NOTE(review): as called from the trial loop, ($x_1, $y_1) has just been
    ; set to ($x, $y), so this condition is always true there and every
    ; square the agent leaves is decayed. Left as-is because the decay may be
    ; what keeps this policy from oscillating - confirm the intent.
    If $x_1 = $x And $y_1 = $y Then
        $NumberGrid[$y][$x] = $NumberGrid[$y][$x] - ($NumberGrid[$y][$x] * 0.05)
    EndIf

    Switch $Best4MaxIndex
        Case 0
            If $x + 1 <= ($iGridSize - 1) Then $x += 1
        Case 1
            If $x - 1 >= 0 Then $x -= 1
        Case 2
            If $y + 1 <= ($iGridSize - 1) Then $y += 1
        Case 3
            If $y - 1 >= 0 Then $y -= 1
    EndSwitch
EndFunc

; Roulette-wheel selection of the next square.
;   $rRX   - values of the four neighbours in the order x+1, x-1, y+1, y-1
;            (0 for a neighbour outside the grid), as returned by Select4Value
;   $x, $y - [in/out] agent position; moved one square in the chosen
;            direction, or left unchanged if that would leave the grid
; A random number is drawn between 0 and the sum of the four values; each
; direction owns a slice of that range as wide as its value.
Func Roulette($rRX, ByRef $x, ByRef $y)
    ; Cumulative upper bounds of the four slices
    Local $rEndXPlus = $rRX[0]
    Local $rEndXMinus = $rEndXPlus + $rRX[1]
    Local $rEndYPlus = $rEndXMinus + $rRX[2]
    Local $rEndYMinus = $rEndYPlus + $rRX[3]

    Local $RouletteSpin = Random(0, $rEndYMinus)

    If $RouletteSpin >= 0 And $RouletteSpin <= $rEndXPlus Then
        If $x + 1 <= ($iGridSize - 1) Then $x += 1
    ElseIf $RouletteSpin >= $rEndXPlus And $RouletteSpin <= $rEndXMinus Then
        If $x - 1 >= 0 Then $x -= 1
    ElseIf $RouletteSpin >= $rEndXMinus And $RouletteSpin <= $rEndYPlus Then
        If $y + 1 <= ($iGridSize - 1) Then $y += 1
    ElseIf $RouletteSpin >= $rEndYPlus And $RouletteSpin <= $rEndYMinus Then
        If $y - 1 >= 0 Then $y -= 1
    EndIf
EndFunc

; Returns the values of the four squares around ($x, $y) as a 4-element
; array ordered x+1, x-1, y+1, y-1; a neighbour outside the grid scores 0.
; The parameters are ByRef but are not modified.
Func Select4Value(ByRef $x, ByRef $y)
    Local $rRX[4] = [0, 0, 0, 0]

    If $x + 1 <= ($iGridSize - 1) Then $rRX[0] = $NumberGrid[$y][$x + 1]
    If $x - 1 >= 0 Then $rRX[1] = $NumberGrid[$y][$x - 1]
    If $y + 1 <= ($iGridSize - 1) Then $rRX[2] = $NumberGrid[$y + 1][$x]
    If $y - 1 >= 0 Then $rRX[3] = $NumberGrid[$y - 1][$x]

    ;_ArrayDisplay($rRX) ;debug
    Return $rRX
EndFunc
Edited July 6, 2011 by ABV Link to comment Share on other sites More sharing options...
E1M1 Posted July 6, 2011 Share Posted July 6, 2011 By the end of day it reached over 1450 but still sometimes took up to 1000 (and sometimes even more) movements to find G edited Link to comment Share on other sites More sharing options...
ABV Posted July 18, 2011 Author Share Posted July 18, 2011 By the end of day it reached over 1450 but still sometimes took up to 1000 (and sometimes even more) movements to find G. The Roulette is a probabilistic policy; it is good at exploring a grid world and gravitates to the goal. The best 4 is deterministic and tends to drive a path to the goal. This is a very simple form of AI: the agent tends to repeat previously rewarding behaviour (learning) and can unlearn old behaviour if the environment changes. If this were a vacuum cleaner, you can see from your result with the Roulette policy that the entire floor would get a clean, but most of the time would be spent in a more rewarding dirty area. If you were trying to solve a maze then best 4 would work well. Link to comment Share on other sites More sharing options...
Recommended Posts
Create an account or sign in to comment
You need to be a member in order to leave a comment
Create an account
Sign up for a new account in our community. It's easy!
Register a new accountSign in
Already have an account? Sign in here.
Sign In Now