REINFORCE with Baseline Policy Gradient Algorithm
Baseline algorithms attempt to stabilise learning by subtracting an estimate of the expected return (the baseline) from the observed return before it is used in the update, which keeps the update targets centred and reduces their variance. Contrast this with vanilla policy gradient or Q-learning algorithms that work with the raw, ever-growing returns or Q-values, where a minor incremental update to one of the actions can cause vast changes in the policy.
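Concretely, the update used later in this workshop is the standard REINFORCE-with-baseline rule. Because the baseline b(S_t) does not depend on the action, subtracting it leaves the expected gradient unchanged while reducing its variance:

$$
\theta \leftarrow \theta + \alpha \, \gamma^{t} \, \bigl( G_t - b(S_t) \bigr) \, \nabla_{\theta} \ln \pi(A_t \mid S_t, \theta)
$$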
In this workshop I will build upon the previous one and also show you how to visualise the discounted reward over various states. I won’t repeat the elements discussed in previous tutorials.
A note on usage
Note that this notebook might not work on your machine because simple_rl forces TkAgg on some machines. See https://github.com/david-abel/simple_rl/issues/40
Also, Pygame is notoriously picky and expects a number of compiler- and system-level libraries.
I managed to get this working in the following Docker image: docker run -it -p 8888:8888 jupyter/scipy-notebook:54462805efcb
This code is untested on any other image.
TODO: migrate away from simple_rl and Pygame. TODO: create dedicated Q-learning and SARSA notebooks.
!pip install pygame==1.9.6 pandas==1.0.5 matplotlib==3.2.1 gym==0.17.3 > /dev/null
!pip install --upgrade git+git://github.com/david-abel/simple_rl.git@77c0d6b910efbe8bdd5f4f87337c5bc4aed0d79c > /dev/null
import matplotlib
matplotlib.use("agg", force=True)
Setup and Previous Agents
import gym
import random
import numpy as np
from simple_rl.tasks import GymMDP
# Gym MDP
gym_mdp = GymMDP(env_name="CartPole-v1", render=False)
num_feats = gym_mdp.get_num_state_feats()
GLOBAL_SEED = 0
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
gym_mdp.env.seed(GLOBAL_SEED)
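Before defining any agents, it can help to peek at the raw Gym environment wrapped by the MDP (the attribute access below assumes, as the seeding call above does, that GymMDP exposes the underlying environment as env):

# Optional sanity check on the wrapped environment. CartPole-v1 has a
# 4-dimensional observation (cart position, cart velocity, pole angle,
# pole angular velocity) and two discrete actions (push left / push right).
print(num_feats)
print(gym_mdp.env.observation_space)
print(gym_mdp.env.action_space)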
from simple_rl.agents import PolicyGradientAgent
class LogisticPolicyAgent(PolicyGradientAgent):
def __init__(self, actions, num_feats):
self.α = 0.01
self.γ = 0.99
self.num_feats = num_feats
PolicyGradientAgent.__init__(
self, name="logistic_policy_gradient", actions=actions
)
self.reset()
@staticmethod
def logistic(x):
return 1 / (1 + np.exp(-x))
    @staticmethod
    def π(θ, s):
        # Bernoulli policy over CartPole's two actions: action 0 gets
        # probability σ(θᵀs), action 1 gets the remainder.
        π = LogisticPolicyAgent.logistic(np.dot(θ.T, s))
        return np.array([π, 1 - π])

    @staticmethod
    def Δ(θ, s):
        # Gradient of ln π(a | s, θ) for each action: s - sπ for action 0,
        # -sπ for action 1.
        π = LogisticPolicyAgent.logistic(np.dot(θ.T, s))
        return np.array([s - s * π, -s * π])
def act(self, state, reward):
if self.previous_pair is not None:
self.episode_history.append(Step(self.previous_pair, reward))
π = LogisticPolicyAgent.π(self.θ, state)
action = np.random.choice((0, 1), p=π)
self.previous_pair = Pair(state.data, action)
return action
def reset(self):
self.θ = np.zeros(self.num_feats)
self.end_of_episode()
PolicyGradientAgent.reset(self)
    def end_of_episode(self):
        T = len(getattr(self, "episode_history", []))
        G = 0
        grad_buf = []
        # Walk backwards through the episode, accumulating the discounted
        # return G and applying the REINFORCE update at every step.
        for t in reversed(range(T)):
            G = G * self.γ + self.episode_history[t].reward
            grad = LogisticPolicyAgent.Δ(self.θ, self.episode_history[t].pair.state)[
                self.episode_history[t].pair.action
            ]
            self.θ += self.α * np.power(self.γ, t) * grad * G
            grad_buf.append(np.power(self.γ, t) * grad * G)
        # Record the size and spread of this episode's updates for later comparison.
        reinforce_gradient_buffer.append(
            [np.mean(np.abs(grad_buf)), np.std(grad_buf)])
        self.episode_history = []
        self.previous_pair = None
        PolicyGradientAgent.end_of_episode(self)
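As a quick sanity check (not part of the original workshop), you can verify that Δ really is the gradient of ln π by comparing it against a central finite-difference approximation for a random state and parameter vector:

# Hypothetical check: compare the analytic score function Δ with a
# finite-difference approximation of ∇_θ ln π(a | s, θ) for both actions.
check_θ = np.random.randn(num_feats)
check_s = np.random.randn(num_feats)
eps = 1e-6
for a in (0, 1):
    analytic = LogisticPolicyAgent.Δ(check_θ, check_s)[a]
    numeric = np.zeros(num_feats)
    for i in range(num_feats):
        θ_hi, θ_lo = check_θ.copy(), check_θ.copy()
        θ_hi[i] += eps
        θ_lo[i] -= eps
        numeric[i] = (
            np.log(LogisticPolicyAgent.π(θ_hi, check_s)[a])
            - np.log(LogisticPolicyAgent.π(θ_lo, check_s)[a])
        ) / (2 * eps)
    assert np.allclose(analytic, numeric, atol=1e-4)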
REINFORCE with Baseline Agent
The new agent is basically the same as the standard REINFORCE agent, except that it maintains a weighted moving average of past returns as a baseline, which is subtracted from the observed return before the policy update.
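In update form, for each step t of an episode the end_of_episode loop below applies, where w is the scalar moving-average baseline:

$$
\delta = G_t - w, \qquad
w \leftarrow w + \alpha_w \, \delta, \qquad
\theta \leftarrow \theta + \alpha_\theta \, \gamma^{t} \, \delta \, \nabla_{\theta} \ln \pi(A_t \mid S_t, \theta)
$$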
class LogisticPolicyAgentWithBaseline(PolicyGradientAgent):
def __init__(self, actions, num_feats, α_θ=0.01, prefix=""):
self.α_θ = α_θ
self.α_w = 0.1
self.γ = 0.99
self.num_feats = num_feats
PolicyGradientAgent.__init__(
self, name=prefix + "logistic_policy_gradient_with_baseline", actions=actions
)
self.reset()
def act(self, state, reward):
if self.previous_pair is not None:
self.episode_history.append(Step(self.previous_pair, reward))
π = LogisticPolicyAgent.π(self.θ, state)
action = np.random.choice((0, 1), p=π)
self.previous_pair = Pair(state.data, action)
self.t += 1
return action
def reset(self):
self.θ = np.zeros(self.num_feats)
self.w = 0
self.end_of_episode()
PolicyGradientAgent.reset(self)
    def end_of_episode(self):
        self.t = 0
        T = len(getattr(self, "episode_history", []))
        G = 0
        grad_buf = []
        for t in reversed(range(T)):
            G = G * self.γ + self.episode_history[t].reward
            # Baseline error: how far the observed return is from the
            # moving-average estimate w.
            δ = G - self.w
            # Save (state, discounted return) pairs for the PCA plot later on.
            global_buffer.append(
                np.concatenate(
                    [self.episode_history[t].pair.state, np.array([G])])
            )
            # Nudge the baseline towards the observed return.
            self.w += self.α_w * δ
            Δπ = LogisticPolicyAgent.Δ(self.θ, self.episode_history[t].pair.state)[
                self.episode_history[t].pair.action
            ]
            # The policy update uses the baselined return δ instead of G.
            self.θ += self.α_θ * np.power(self.γ, t) * δ * Δπ
            grad_buf.append(np.power(self.γ, t) * δ * Δπ)
        # Record the size and spread of this episode's updates for later comparison.
        baseline_gradient_buffer.append(
            [np.mean(np.abs(grad_buf)), np.std(grad_buf)])
        self.episode_history = []
        self.previous_pair = None
        PolicyGradientAgent.end_of_episode(self)
Training the Agent
Now I’m ready to run the experiment to train the agent. You might want to play around with the instances
parameter, which controls the number of repeats to average over.
from simple_rl.run_experiments import run_agents_on_mdp
from collections import namedtuple
Step = namedtuple("Step", ["pair", "reward"])
Pair = namedtuple("Pair", ["state", "action"])
reinforce_gradient_buffer = []
REINFORCE = LogisticPolicyAgent(gym_mdp.get_actions(), num_feats)
run_agents_on_mdp(
[REINFORCE],
gym_mdp,
instances=2,
episodes=500,
steps=1000,
open_plot=False,
verbose=False,
cumulative_plot=False,
)
np.savetxt("gradient_REINFORCE.txt", np.array(reinforce_gradient_buffer))
global_buffer = []
baseline_gradient_buffer = []
REINFORCE_baseline = LogisticPolicyAgentWithBaseline(
gym_mdp.get_actions(), num_feats
)
run_agents_on_mdp(
[REINFORCE_baseline],
gym_mdp,
instances=2,
episodes=500,
steps=1000,
open_plot=False,
verbose=False,
cumulative_plot=False,
)
np.savetxt("state_reward_REINFORCE_baseline.txt", np.array(global_buffer))
np.savetxt("gradient_REINFORCE_baseline.txt",
np.array(baseline_gradient_buffer))
Running experiment:
(MDP)
gym-CartPole-v1
(Agents)
logistic_policy_gradient,0
(Params)
instances : 2
episodes : 500
steps : 1000
track_disc_reward : False
logistic_policy_gradient is learning.
Instance 1 of 2.
(NumPy RuntimeWarnings about empty slices omitted.)
Instance 2 of 2.
--- TIMES ---
logistic_policy_gradient agent took 119.83 seconds.
-------------
logistic_policy_gradient: 176.0 (conf_interv: 67.91 )
Running experiment:
(MDP)
gym-CartPole-v1
(Agents)
logistic_policy_gradient_with_baseline,0
(Params)
instances : 2
episodes : 500
steps : 1000
track_disc_reward : False
logistic_policy_gradient_with_baseline is learning.
Instance 1 of 2.
(NumPy RuntimeWarnings about empty slices omitted.)
Instance 2 of 2.
--- TIMES ---
logistic_policy_gradient_with_baseline agent took 155.76 seconds.
-------------
logistic_policy_gradient_with_baseline: 397.0 (conf_interv: 40.19 )
Plotting the Episode Return
Ideally the baseline would predict the expected return for a given state, which means you need a model capable of representing that mapping. I tried a few things, but quickly realised that the data are surprisingly awkward to model. The code below takes the (state, discounted return) pairs recorded during training, runs principal component analysis on the states, and plots the result.
Remember that there are four state features, so the space is more complex than this one-dimensional projection. But you can quickly see that the data is strange: it is shaped like a triangle, primarily because each return depends on the previous one; returns are cumulative. Given that, I settled on a simple online rolling average of previously observed returns as the baseline.
%matplotlib inline
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
data = np.loadtxt("state_reward_REINFORCE_baseline.txt")
x = data[:, 0:4]
pca = PCA(n_components=1)
x = pca.fit_transform(x)
y = data[:, 4]
fig, ax = plt.subplots(nrows=1, ncols=1)
plt.plot(x, y, marker='.', alpha=0.1, markeredgewidth=0.0)
ax.set_xlabel('First Principal Component of State')
ax.set_ylabel('Discounted Reward')
plt.show()
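For reference, the textbook form of REINFORCE with baseline learns a state-value estimate v̂(s, w) rather than a scalar average. Below is a minimal sketch of what a linear state-value baseline could look like, using the same δ-style update as the agent above; this class is hypothetical and is not used anywhere in this workshop.

# Hypothetical linear state-value baseline v(s) = w·s. A sketch of the
# textbook alternative, not the rolling-average baseline used above.
class LinearValueBaseline:
    def __init__(self, num_feats, α_w=0.01):
        self.w = np.zeros(num_feats)
        self.α_w = α_w

    def value(self, s):
        # Estimated return for state s.
        return np.dot(self.w, s)

    def update(self, s, G):
        # δ is the gap between the observed return and the estimate; a
        # policy update would use this δ in place of G - w.
        δ = G - self.value(s)
        self.w += self.α_w * δ * s
        return δ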
Plotting the Results
The following will read from the saved results.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def plot(experiment_name, data_files, cutoff=None):
fig, ax = plt.subplots(nrows=1, ncols=1)
for j, (name, data_file) in enumerate(data_files):
df = pd.read_csv(data_file, header=None).transpose()
if cutoff:
df = df.truncate(after=cutoff)
x = df.index.values
y = df.values
if len(y.shape) > 1:
y = y.mean(axis=1)
ax.plot(x,
y,
label=name)
    ax.set_xlabel('Episode')
    ax.set_ylabel('Average Reward (mean over instances)')
ax.legend(loc='lower right')
plt.show()
data_files = [
("REINFORCE with baseline (logistic)", "results/gym-CartPole-v1/logistic_policy_gradient_with_baseline.csv"),
("REINFORCE (logistic)", "results/gym-CartPole-v1/logistic_policy_gradient.csv"),
]
plot("reinforce_reward_plot", data_files, cutoff=500)
Discussion
You can see that the baseline algorithm performs better than the basic REINFORCE algorithm, although with only two repeats the curves are noisy; if you performed more repeats, the difference would become clearer.
I encourage you to try this yourself with a single repeat. You should be able to observe the baseline algorithm improving more consistently than the standard algorithm, which tends to repeatedly collapse back to zero. This is because of subtle but catastrophic updates to the policy parameters: the high-variance return estimates occasionally nudge the policy towards a bad one, and it has to recover.