ift7025-tp1/reinforcement/learningAgents.py
2019-04-10 23:58:15 -04:00

258 lines
9.3 KiB
Python

# learningAgents.py
# -----------------
# Licensing Information: You are free to use or extend these projects for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
#
# Attribution Information: The Pacman AI projects were developed at UC Berkeley.
# The core projects and autograders were primarily created by John DeNero
# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
# Student side autograding was added by Brad Miller, Nick Hay, and
# Pieter Abbeel (pabbeel@cs.berkeley.edu).
from game import Directions, Agent, Actions
import random,util,time
class ValueEstimationAgent(Agent):
    """
    Abstract agent that attaches Q-Values to (state, action) pairs.

    From those Q-Values a state value and a policy follow directly:

        V(s)      = max_{a in actions} Q(s,a)
        policy(s) = arg_max_{a in actions} Q(s,a)

    ValueIterationAgent and QLearningAgent both derive from this class.
    A ValueIterationAgent is given a model of the environment (a
    MarkovDecisionProcess, see mdp.py) and estimates Q-Values before it
    ever acts; a QLearningAgent estimates Q-Values while it is acting in
    the environment.
    """

    def __init__(self, alpha=1.0, epsilon=0.05, gamma=0.8, numTraining=10):
        """
        Store the agent options. They can be supplied from the Pacman
        command line with -a alpha=0.5,...

        alpha       - learning rate
        epsilon     - exploration rate
        gamma       - discount factor (stored as self.discount)
        numTraining - number of training episodes, i.e. no learning
                      after these many episodes
        """
        # Values may arrive as strings, so each one is coerced explicitly.
        self.alpha = float(alpha)
        self.epsilon = float(epsilon)
        self.discount = float(gamma)
        self.numTraining = int(numTraining)

    ####################################
    #    Override These Functions      #
    ####################################

    def getQValue(self, state, action):
        """
        Subclasses return Q(state, action) here.
        """
        util.raiseNotDefined()

    def getValue(self, state):
        """
        Subclasses return the value of `state` under its best action:

            V(s) = max_{a in actions} Q(s,a)
        """
        util.raiseNotDefined()

    def getPolicy(self, state):
        """
        Subclasses return the best action to take in `state`:

            policy(s) = arg_max_{a in actions} Q(s,a)

        Because the agent may want to explore, the result need not match
        what getAction returns. When several actions share the maximal
        Q-value, any one of them may be chosen.
        """
        util.raiseNotDefined()

    def getAction(self, state):
        """
        Subclasses choose an action for `state` and return it.

        state: can call state.getLegalActions()
        """
        util.raiseNotDefined()
class ReinforcementAgent(ValueEstimationAgent):
    """
    Abstract Reinforcement Agent: a ValueEstimationAgent which estimates
    Q-Values (as well as policies) from experience rather than a model.

    What you need to know:
      - The environment will call
        observeTransition(state, action, nextState, deltaReward),
        which will call update(state, action, nextState, deltaReward),
        which you should override.
      - Use self.getLegalActions(state) to know which actions are
        available in a state.
    """

    ####################################
    #    Override These Functions      #
    ####################################

    def update(self, state, action, nextState, reward):
        """
        Called by observeTransition after a transition and its reward
        have been observed. Subclasses implement the learning step here.
        """
        util.raiseNotDefined()

    ####################################
    #    Read These Functions          #
    ####################################

    def getLegalActions(self, state):
        """
        Return the actions available in `state`, as reported by the
        actionFn supplied to the constructor. This is what you should
        use to obtain legal actions for a state.
        """
        return self.actionFn(state)

    def observeTransition(self, state, action, nextState, deltaReward):
        """
        Called by the environment to inform the agent that a transition
        has been observed. Adds deltaReward to the running episode total
        and then calls self.update with the same arguments.

        NOTE: Do *not* override or call this function
        """
        self.episodeRewards += deltaReward
        self.update(state, action, nextState, deltaReward)

    def startEpisode(self):
        """
        Called by the environment when a new episode is starting.
        Clears the remembered state/action and the episode reward total.
        """
        self.lastState = None
        self.lastAction = None
        self.episodeRewards = 0.0

    def stopEpisode(self):
        """
        Called by the environment when the episode is done.

        Credits the episode's rewards to the training or testing total,
        bumps the episode counter, and switches exploration and learning
        off once the training budget has been used up.
        """
        if self.episodesSoFar < self.numTraining:
            self.accumTrainRewards += self.episodeRewards
        else:
            self.accumTestRewards += self.episodeRewards
        self.episodesSoFar += 1
        if self.episodesSoFar >= self.numTraining:
            # Take off the training wheels
            self.epsilon = 0.0    # no exploration
            self.alpha = 0.0      # no learning

    def isInTraining(self):
        """Return True while fewer than numTraining episodes have finished."""
        return self.episodesSoFar < self.numTraining

    def isInTesting(self):
        """Return True once the training episodes are exhausted."""
        return not self.isInTraining()

    def __init__(self, actionFn=None, numTraining=100, epsilon=0.5, alpha=0.5, gamma=1):
        """
        actionFn    - function which takes a state and returns the list
                      of legal actions; when omitted,
                      state.getLegalActions() is used
        alpha       - learning rate
        epsilon     - exploration rate
        gamma       - discount factor (stored as self.discount)
        numTraining - number of training episodes, i.e. no learning
                      after these many episodes
        """
        # Identity test: None is a singleton, and `==` could be hijacked
        # by a callable object that defines its own __eq__.
        if actionFn is None:
            actionFn = lambda state: state.getLegalActions()
        self.actionFn = actionFn
        self.episodesSoFar = 0
        self.accumTrainRewards = 0.0
        self.accumTestRewards = 0.0
        self.numTraining = int(numTraining)
        self.epsilon = float(epsilon)
        self.alpha = float(alpha)
        self.discount = float(gamma)

    ################################
    # Controls needed for Crawler  #
    ################################

    def setEpsilon(self, epsilon):
        """Replace the exploration rate."""
        self.epsilon = epsilon

    def setLearningRate(self, alpha):
        """Replace the learning rate."""
        self.alpha = alpha

    def setDiscount(self, discount):
        """Replace the discount factor."""
        self.discount = discount

    def doAction(self, state, action):
        """
        Called by inherited class when an action is taken in a state.
        Remembers the pair so the next observation can be turned into a
        transition.
        """
        self.lastState = state
        self.lastAction = action

    ###################
    # Pacman Specific #
    ###################

    def observationFunction(self, state):
        """
        This is where we ended up after our last action.
        The simulation should somehow ensure this is called.

        When a previous state is remembered, the score difference is
        reported as the reward of that transition. Returns `state`
        unchanged.
        """
        if self.lastState is not None:
            reward = state.getScore() - self.lastState.getScore()
            self.observeTransition(self.lastState, self.lastAction, state, reward)
        return state

    def registerInitialState(self, state):
        """
        Start a new episode; before the very first one, announce how
        many training episodes are planned.
        """
        self.startEpisode()
        if self.episodesSoFar == 0:
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print('Beginning %d episodes of Training' % (self.numTraining))

    def final(self, state):
        """
        Called by Pacman game at the terminal state.

        Reports the last transition (if any), closes the episode, and
        prints a status summary every NUM_EPS_UPDATE episodes.
        """
        # Same guard as observationFunction: with no remembered state
        # there is no transition to report, and dereferencing None would
        # raise AttributeError.
        if self.lastState is not None:
            deltaReward = state.getScore() - self.lastState.getScore()
            self.observeTransition(self.lastState, self.lastAction, state, deltaReward)
        self.stopEpisode()

        # Make sure we have these vars; they are created lazily here
        # rather than in __init__.
        if 'episodeStartTime' not in self.__dict__:
            self.episodeStartTime = time.time()
        if 'lastWindowAccumRewards' not in self.__dict__:
            self.lastWindowAccumRewards = 0.0
        self.lastWindowAccumRewards += state.getScore()

        NUM_EPS_UPDATE = 100
        if self.episodesSoFar % NUM_EPS_UPDATE == 0:
            print('Reinforcement Learning Status:')
            windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE)
            if self.episodesSoFar <= self.numTraining:
                trainAvg = self.accumTrainRewards / float(self.episodesSoFar)
                print('\tCompleted %d out of %d training episodes' % (
                    self.episodesSoFar, self.numTraining))
                print('\tAverage Rewards over all training: %.2f' % (
                    trainAvg))
            else:
                testAvg = float(self.accumTestRewards) / (self.episodesSoFar - self.numTraining)
                print('\tCompleted %d test episodes' % (self.episodesSoFar - self.numTraining))
                print('\tAverage Rewards over testing: %.2f' % testAvg)
            print('\tAverage Rewards for last %d episodes: %.2f' % (
                NUM_EPS_UPDATE, windowAvg))
            # The timer is only reset inside this branch, so the figure
            # printed is the time elapsed since the previous status report.
            print('\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime))
            self.lastWindowAccumRewards = 0.0
            self.episodeStartTime = time.time()

        if self.episodesSoFar == self.numTraining:
            msg = 'Training Done (turning off epsilon and alpha)'
            print('%s\n%s' % (msg, '-' * len(msg)))