q learning pas mal avancé

2019-04-13 00:43:54 -04:00 · 2019-04-13 00:43:54 -04:00 · 77caa10f89
commit 77caa10f89
parent aca1c3c599
3 changed files with 74 additions and 30 deletions
--- a/reinforcement/analysis.py
+++ b/reinforcement/analysis.py
@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # analysis.py
 # -----------
 # Licensing Information:  You are free to use or extend these projects for
@ -25,45 +26,45 @@ def question2():
    return answerDiscount, answerNoise

 def question3a():
-    answerDiscount = None
-    answerNoise = None
-    answerLivingReward = None
+    answerDiscount = 0.9
+    answerNoise = 0.1
+    answerLivingReward = -3
    return answerDiscount, answerNoise, answerLivingReward
    # If not possible, return 'NOT POSSIBLE'

 def question3b():
-    answerDiscount = None
-    answerNoise = None
-    answerLivingReward = None
+    answerDiscount = 0.1
+    answerNoise = 0.1
+    answerLivingReward = -2
    return answerDiscount, answerNoise, answerLivingReward
    # If not possible, return 'NOT POSSIBLE'

 def question3c():
-    answerDiscount = None
-    answerNoise = None
-    answerLivingReward = None
+    answerDiscount = 0.9
+    answerNoise = 0
+    answerLivingReward = 0
    return answerDiscount, answerNoise, answerLivingReward
    # If not possible, return 'NOT POSSIBLE'

 def question3d():
-    answerDiscount = None
-    answerNoise = None
-    answerLivingReward = None
+    answerDiscount = 0.1
+    answerNoise = 0.1
+    answerLivingReward = 1
    return answerDiscount, answerNoise, answerLivingReward
    # If not possible, return 'NOT POSSIBLE'

 def question3e():
-    answerDiscount = None
-    answerNoise = None
-    answerLivingReward = None
+    answerDiscount = 0
+    answerNoise = 0
+    answerLivingReward = 1
    return answerDiscount, answerNoise, answerLivingReward
    # If not possible, return 'NOT POSSIBLE'

 def question6():
    answerEpsilon = None
    answerLearningRate = None
-    return answerEpsilon, answerLearningRate
-    # If not possible, return 'NOT POSSIBLE'
+    # return answerEpsilon, answerLearningRate
+    return 'NOT POSSIBLE'

 if __name__ == '__main__':
    print 'Answers to analysis questions:'
--- a/reinforcement/qlearningAgents.py
+++ b/reinforcement/qlearningAgents.py
@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # qlearningAgents.py
 # ------------------
 # Licensing Information:  You are free to use or extend these projects for
@ -43,6 +44,7 @@ class QLearningAgent(ReinforcementAgent):
        ReinforcementAgent.__init__(self, **args)

        "*** YOUR CODE HERE ***"
+        self.q_values = {}

    def getQValue(self, state, action):
        """
@ -51,8 +53,11 @@ class QLearningAgent(ReinforcementAgent):
          or the Q node value otherwise
        """
        "*** YOUR CODE HERE ***"
-        util.raiseNotDefined()
-
+        if (state, action) in self.q_values:
+            return self.q_values.get((state, action))
+        else:
+            return 0
+        # util.raiseNotDefined()

    def computeValueFromQValues(self, state):
        """
@ -62,7 +67,14 @@ class QLearningAgent(ReinforcementAgent):
          terminal state, you should return a value of 0.0.
        """
        "*** YOUR CODE HERE ***"
-        util.raiseNotDefined()
+        legal_actions = self.getLegalActions(state)
+
+        if len(legal_actions) == 0:
+            return 0.0
+
+        return max([self.getQValue(state, action) for action in legal_actions])
+
+        # util.raiseNotDefined()

    def computeActionFromQValues(self, state):
        """
@ -71,7 +83,23 @@ class QLearningAgent(ReinforcementAgent):
          you should return None.
        """
        "*** YOUR CODE HERE ***"
-        util.raiseNotDefined()
+
+        legalActions = self.getLegalActions(state)
+
+        if len(legalActions) == 0:
+            return None
+
+        q_values = [self.getQValue(state, action) for action in legalActions]
+        q_max = max(q_values)
+
+        q_max_indices = []
+        for index, value in enumerate(q_values):
+            if value == q_max:
+                q_max_indices.append(index)
+
+        return legalActions[random.choice(q_max_indices)]
+
+        # util.raiseNotDefined()

    def getAction(self, state):
        """
@ -86,11 +114,21 @@ class QLearningAgent(ReinforcementAgent):
        """
        # Pick Action
        legalActions = self.getLegalActions(state)
-        action = None
-        "*** YOUR CODE HERE ***"
-        util.raiseNotDefined()

-        return action
+        "*** YOUR CODE HERE ***"
+        if len(legalActions) == 0:
+            return None
+
+        best_action = self.computeActionFromQValues(state)
+
+        if util.flipCoin(self.epsilon):
+            # Action aléatoire
+            return random.choice(legalActions)
+        else:
+            # Meilleure action
+            return best_action
+
+        # util.raiseNotDefined()

    def update(self, state, action, nextState, reward):
        """
@ -102,7 +140,12 @@ class QLearningAgent(ReinforcementAgent):
          it will be called on your behalf
        """
        "*** YOUR CODE HERE ***"
-        util.raiseNotDefined()
+        q_value = self.getQValue(state, action)
+        best_value = self.getValue(nextState)
+        new_q_value = (1-self.alpha)*q_value+self.alpha*(reward+self.discount*best_value)
+        self.q_values[(state, action)] = new_q_value
+        self.q_values.update({(state, action): new_q_value})
+        # util.raiseNotDefined()

    def getPolicy(self, state):
        return self.computeActionFromQValues(state)
--- a/reinforcement/valueIterationAgents.py
+++ b/reinforcement/valueIterationAgents.py
@ -50,7 +50,7 @@ class ValueIterationAgent(ValueEstimationAgent):
        "*** YOUR CODE HERE ***"
        states = self.mdp.getStates()

-        print "__init__ ... states: " + str(states)
+        # print "__init__ ... states: " + str(states)

        for i in range(iterations):
            # On reprend les valeurs de l'itération précédente comme référence
@ -107,9 +107,9 @@ class ValueIterationAgent(ValueEstimationAgent):
            return None

        q_values = [self.computeQValueFromValues(state, action) for action in possibleActions]
-        print "computeActionFromValues ... q_values: "+str(q_values)
-        print "index:"+str(q_values.index(max(q_values)))
-        print "action:"+str(possibleActions[q_values.index(max(q_values))])
+        # print "computeActionFromValues ... q_values: "+str(q_values)
+        # print "index:"+str(q_values.index(max(q_values)))
+        # print "action:"+str(possibleActions[q_values.index(max(q_values))])
        return possibleActions[q_values.index(max(q_values))]

    def getPolicy(self, state):