# -*- coding: utf-8 -*-
# valueIterationAgents.py
# -----------------------
# Licensing Information: You are free to use or extend these projects for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
#
# Attribution Information: The Pacman AI projects were developed at UC Berkeley.
# The core projects and autograders were primarily created by John DeNero
# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
# Student side autograding was added by Brad Miller, Nick Hay, and
# Pieter Abbeel (pabbeel@cs.berkeley.edu).


import mdp, util

from learningAgents import ValueEstimationAgent


class ValueIterationAgent(ValueEstimationAgent):
    """
    * Please read learningAgents.py before reading this.*

    A ValueIterationAgent takes a Markov decision process
    (see mdp.py) on initialization and runs value iteration
    for a given number of iterations using the supplied
    discount factor.
    """

    def __init__(self, mdp, discount=0.9, iterations=100):
        """
        Your value iteration agent should take an mdp on
        construction, run the indicated number of iterations
        and then act according to the resulting policy.

        Some useful mdp methods you will use:
            mdp.getStates()
            mdp.getPossibleActions(state)
            mdp.getTransitionStatesAndProbs(state, action)
            mdp.getReward(state, action, nextState)
            mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        states = self.mdp.getStates()

        print "__init__ ... states: " + str(states)
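        # Batch ("synchronous") value iteration: each sweep below computes
        #     V_{k+1}(s) = max_a sum_{s'} T(s, a, s') * [ R(s, a, s') + discount * V_k(s') ]
        # from a copy of the previous sweep's values, then commits the whole
        # sweep at once, so updates within a sweep never see each other.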
        for i in range(iterations):
            # Use the values from the previous iteration as the reference
            # (copy them so the whole sweep is updated as one batch)
            q_copy = self.values.copy()
            for state in states:
                q_new = None
                for action in self.mdp.getPossibleActions(state):
                    q = self.computeQValueFromValues(state, action)
                    # Keep the best Q-value
                    if q_new is None or q_new < q:
                        q_new = q
                # Handle states with no successors (no legal actions)
                if q_new is None:
                    q_copy[state] = 0
                else:
                    q_copy[state] = q_new
            # Commit the updated values for the next iteration
            self.values = q_copy

    def getValue(self, state):
        """
        Return the value of the state (computed in __init__).
        """
        return self.values[state]

    def computeQValueFromValues(self, state, action):
        """
        Compute the Q-value of action in state from the
        value function stored in self.values.
        """
        "*** YOUR CODE HERE ***"
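        # Expected one-step return of taking `action` in `state`:
        #     Q(s, a) = sum_{s'} T(s, a, s') * [ R(s, a, s') + discount * V(s') ]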
        values = []
        for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
            reward = self.mdp.getReward(state, action, nextState)
            discount = self.discount
            next_state_value = self.values[nextState]
            values.append(prob * (reward + discount * next_state_value))
        return sum(values)

    def computeActionFromValues(self, state):
        """
        The policy is the best action in the given state
        according to the values currently stored in self.values.

        You may break ties any way you see fit. Note that if
        there are no legal actions, which is the case at the
        terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"
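        # Greedy (one-step lookahead) policy extraction:
        #     pi(s) = argmax_a Q(s, a)
        # returning None for terminal states; ties go to the first
        # maximizing action in the list returned by the MDP.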
        possibleActions = self.mdp.getPossibleActions(state)

        if len(possibleActions) == 0:
            return None

        q_values = [self.computeQValueFromValues(state, action) for action in possibleActions]
        print "computeActionFromValues ... q_values: " + str(q_values)
        print "index:" + str(q_values.index(max(q_values)))
        print "action:" + str(possibleActions[q_values.index(max(q_values))])
        return possibleActions[q_values.index(max(q_values))]

    def getPolicy(self, state):
        return self.computeActionFromValues(state)

    def getAction(self, state):
        "Returns the policy at the state (no exploration)."
        return self.computeActionFromValues(state)

    def getQValue(self, state, action):
        return self.computeQValueFromValues(state, action)
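# ---------------------------------------------------------------------------
# Rough usage sketch (not part of the Berkeley assignment): the tiny MDP
# below is a hypothetical stand-in that only provides the mdp methods listed
# in the __init__ docstring, so this file can be exercised on its own with
#     python valueIterationAgents.py
# as long as mdp.py, util.py and learningAgents.py are importable.
# ---------------------------------------------------------------------------
class _ToyChainMDP:
    "Hypothetical two-state chain A -> B -> TERMINAL with reward 1 on exit."

    def getStates(self):
        return ['A', 'B', 'TERMINAL']

    def getPossibleActions(self, state):
        return () if state == 'TERMINAL' else ('forward',)

    def getTransitionStatesAndProbs(self, state, action):
        return [('B', 1.0)] if state == 'A' else [('TERMINAL', 1.0)]

    def getReward(self, state, action, nextState):
        return 1.0 if nextState == 'TERMINAL' else 0.0

    def isTerminal(self, state):
        return state == 'TERMINAL'


if __name__ == '__main__':
    agent = ValueIterationAgent(_ToyChainMDP(), discount=0.9, iterations=10)
    print "V(A)  =", agent.getValue('A')    # 0.9 (discounted exit reward)
    print "V(B)  =", agent.getValue('B')    # 1.0
    print "pi(A) =", agent.getPolicy('A')   # 'forward'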