From 77caa10f89ebc65d13ff15c61f655c39d193f96f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?=
Date: Sat, 13 Apr 2019 00:43:54 -0400
Subject: [PATCH] =?UTF-8?q?q=20learning=20pas=20mal=20avanc=C3=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (free text after the '---' marker; not part of the commit
message):
  * Line structure and indentation of this patch were reconstructed from
    the hunk headers; verify context lines against the target files.
  * QLearningAgent.getQValue: single dict lookup with a float default
    (0.0) instead of a membership test followed by .get() returning the
    int 0.
  * QLearningAgent.update: dropped the redundant second write of the
    same (state, action) entry via dict.update().
  * Comments on added lines are in English; hunk headers and the
    diffstat were recomputed accordingly.
  * The post-image blob id on the qlearningAgents.py index line is the
    one from the original commit and no longer matches this content.

 reinforcement/analysis.py             | 35 ++++++++--------
 reinforcement/qlearningAgents.py      | 57 ++++++++++++++++++++++-----
 reinforcement/valueIterationAgents.py |  8 ++--
 3 files changed, 70 insertions(+), 30 deletions(-)

diff --git a/reinforcement/analysis.py b/reinforcement/analysis.py
index ff5ecb0..497e5d2 100644
--- a/reinforcement/analysis.py
+++ b/reinforcement/analysis.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # analysis.py
 # -----------
 # Licensing Information: You are free to use or extend these projects for
@@ -25,45 +26,45 @@ def question2():
     return answerDiscount, answerNoise
 
 def question3a():
-    answerDiscount = None
-    answerNoise = None
-    answerLivingReward = None
+    answerDiscount = 0.9
+    answerNoise = 0.1
+    answerLivingReward = -3
     return answerDiscount, answerNoise, answerLivingReward
     # If not possible, return 'NOT POSSIBLE'
 
 def question3b():
-    answerDiscount = None
-    answerNoise = None
-    answerLivingReward = None
+    answerDiscount = 0.1
+    answerNoise = 0.1
+    answerLivingReward = -2
     return answerDiscount, answerNoise, answerLivingReward
     # If not possible, return 'NOT POSSIBLE'
 
 def question3c():
-    answerDiscount = None
-    answerNoise = None
-    answerLivingReward = None
+    answerDiscount = 0.9
+    answerNoise = 0
+    answerLivingReward = 0
     return answerDiscount, answerNoise, answerLivingReward
     # If not possible, return 'NOT POSSIBLE'
 
 def question3d():
-    answerDiscount = None
-    answerNoise = None
-    answerLivingReward = None
+    answerDiscount = 0.1
+    answerNoise = 0.1
+    answerLivingReward = 1
     return answerDiscount, answerNoise, answerLivingReward
     # If not possible, return 'NOT POSSIBLE'
 
 def question3e():
-    answerDiscount = None
-    answerNoise = None
-    answerLivingReward = None
+    answerDiscount = 0
+    answerNoise = 0
+    answerLivingReward = 1
     return answerDiscount, answerNoise, answerLivingReward
     # If not possible, return 'NOT POSSIBLE'
 
 def question6():
     answerEpsilon = None
     answerLearningRate = None
-    return answerEpsilon, answerLearningRate
-    # If not possible, return 'NOT POSSIBLE'
+    # return answerEpsilon, answerLearningRate
+    return 'NOT POSSIBLE'
 
 if __name__ == '__main__':
     print 'Answers to analysis questions:'
diff --git a/reinforcement/qlearningAgents.py b/reinforcement/qlearningAgents.py
index a417a5f..5c13b2c 100644
--- a/reinforcement/qlearningAgents.py
+++ b/reinforcement/qlearningAgents.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # qlearningAgents.py
 # ------------------
 # Licensing Information: You are free to use or extend these projects for
@@ -43,6 +44,7 @@ class QLearningAgent(ReinforcementAgent):
         ReinforcementAgent.__init__(self, **args)
 
         "*** YOUR CODE HERE ***"
+        self.q_values = {}
 
     def getQValue(self, state, action):
         """
@@ -51,8 +53,8 @@ class QLearningAgent(ReinforcementAgent):
           or the Q node value otherwise
         """
         "*** YOUR CODE HERE ***"
-        util.raiseNotDefined()
-
+        return self.q_values.get((state, action), 0.0)
+        # util.raiseNotDefined()
 
     def computeValueFromQValues(self, state):
         """
@@ -62,7 +64,14 @@ class QLearningAgent(ReinforcementAgent):
           terminal state, you should return a value of 0.0.
         """
         "*** YOUR CODE HERE ***"
-        util.raiseNotDefined()
+        legal_actions = self.getLegalActions(state)
+
+        if len(legal_actions) == 0:
+            return 0.0
+
+        return max([self.getQValue(state, action) for action in legal_actions])
+
+        # util.raiseNotDefined()
 
     def computeActionFromQValues(self, state):
         """
@@ -71,7 +80,23 @@ class QLearningAgent(ReinforcementAgent):
           you should return None.
         """
         "*** YOUR CODE HERE ***"
-        util.raiseNotDefined()
+
+        legalActions = self.getLegalActions(state)
+
+        if len(legalActions) == 0:
+            return None
+
+        q_values = [self.getQValue(state, action) for action in legalActions]
+        q_max = max(q_values)
+
+        q_max_indices = []
+        for index, value in enumerate(q_values):
+            if value == q_max:
+                q_max_indices.append(index)
+
+        return legalActions[random.choice(q_max_indices)]
+
+        # util.raiseNotDefined()
 
     def getAction(self, state):
         """
@@ -86,11 +111,21 @@ class QLearningAgent(ReinforcementAgent):
         """
         # Pick Action
         legalActions = self.getLegalActions(state)
-        action = None
-        "*** YOUR CODE HERE ***"
-        util.raiseNotDefined()
 
-        return action
+        "*** YOUR CODE HERE ***"
+        if len(legalActions) == 0:
+            return None
+
+        best_action = self.computeActionFromQValues(state)
+
+        if util.flipCoin(self.epsilon):
+            # Explore: random legal action
+            return random.choice(legalActions)
+        else:
+            # Exploit: best known action
+            return best_action
+
+        # util.raiseNotDefined()
 
     def update(self, state, action, nextState, reward):
         """
@@ -102,7 +137,11 @@ class QLearningAgent(ReinforcementAgent):
           it will be called on your behalf
         """
         "*** YOUR CODE HERE ***"
-        util.raiseNotDefined()
+        q_value = self.getQValue(state, action)
+        best_value = self.getValue(nextState)
+        new_q_value = (1-self.alpha)*q_value+self.alpha*(reward+self.discount*best_value)
+        self.q_values[(state, action)] = new_q_value
+        # util.raiseNotDefined()
 
     def getPolicy(self, state):
         return self.computeActionFromQValues(state)
diff --git a/reinforcement/valueIterationAgents.py b/reinforcement/valueIterationAgents.py
index f56bd51..bdfe560 100644
--- a/reinforcement/valueIterationAgents.py
+++ b/reinforcement/valueIterationAgents.py
@@ -50,7 +50,7 @@ class ValueIterationAgent(ValueEstimationAgent):
 
         "*** YOUR CODE HERE ***"
         states = self.mdp.getStates()
-        print "__init__ ... states: " + str(states)
+        # print "__init__ ... states: " + str(states)
 
         for i in range(iterations):
             # On reprend les valeurs de l'itération précédente comme référence
@@ -107,9 +107,9 @@ class ValueIterationAgent(ValueEstimationAgent):
             return None
 
         q_values = [self.computeQValueFromValues(state, action) for action in possibleActions]
-        print "computeActionFromValues ... q_values: "+str(q_values)
-        print "index:"+str(q_values.index(max(q_values)))
-        print "action:"+str(possibleActions[q_values.index(max(q_values))])
+        # print "computeActionFromValues ... q_values: "+str(q_values)
+        # print "index:"+str(q_values.index(max(q_values)))
+        # print "action:"+str(possibleActions[q_values.index(max(q_values))])
         return possibleActions[q_values.index(max(q_values))]
 
     def getPolicy(self, state):