REINFORCE with Baseline Policy Gradient Algorithm

A notebook investigating the REINFORCE with baseline policy gradient algorithm.

The various baseline algorithms attempt to stabilise learning by subtracting an estimate of the average return from the observed return before it scales the policy update, which keeps the effective update values small and stable. Contrast this with vanilla policy gradient or Q-learning algorithms, which work with the raw, ever-accumulating return or Q-value, so a minor incremental update to one of the actions can cause vast changes in the policy.
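
To make this concrete, here is a toy sketch of my own (not part of the agents below), assuming a made-up score function (the gradient of the log-policy) that is ±1 with equal probability and returns that are large on average: subtracting a baseline from the return leaves the mean of the quantity scaling the gradient unchanged while drastically shrinking its spread.

import numpy as np

rng = np.random.default_rng(0)
n = 100_000
# Toy score function ∇ln π(a|s): zero mean under the policy.
score = rng.choice([-1.0, 1.0], size=n)
# Returns that are large on average and mildly correlated with the score.
G = 100.0 + 5.0 * score + rng.normal(0.0, 10.0, size=n)

baseline = G.mean()                  # a simple average-return estimate
g_plain = G * score                  # scales the gradient in plain REINFORCE
g_baseline = (G - baseline) * score  # scales the gradient with a baseline

print("means:", g_plain.mean(), g_baseline.mean())  # both ≈ 5
print("stds: ", g_plain.std(), g_baseline.std())    # ≈ 100 vs ≈ 10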

In this workshop I will build upon the previous notebook and also show you how to visualise the discounted reward over various states. I won’t repeat the elements discussed in previous tutorials.

A note on usage

Note that this notebook might not work on your machine because simple_rl forces TkAgg on some machines. See https://github.com/david-abel/simple_rl/issues/40

Also, Pygame is notoriously picky and expects loads of compiler/system related libraries.

I managed to get this working with the following Docker image: docker run -it -p 8888:8888 jupyter/scipy-notebook:54462805efcb

This code is untested on any other image.

TODO: migrate away from simple_rl and Pygame. TODO: Create dedicated Q-learning and SARSA notebooks.

!pip install pygame==1.9.6 pandas==1.0.5 matplotlib==3.2.1 gym==0.17.3 > /dev/null
!pip install --upgrade git+https://github.com/david-abel/simple_rl.git@77c0d6b910efbe8bdd5f4f87337c5bc4aed0d79c > /dev/null
import matplotlib
matplotlib.use("agg", force=True)

Setup and Previous Agents

import gym
import random
import numpy as np
from simple_rl.tasks import GymMDP


# Gym MDP
gym_mdp = GymMDP(env_name="CartPole-v1", render=False)
num_feats = gym_mdp.get_num_state_feats()

GLOBAL_SEED = 0
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
gym_mdp.env.seed(GLOBAL_SEED)

from simple_rl.agents import PolicyGradientAgent

class LogisticPolicyAgent(PolicyGradientAgent):
    def __init__(self, actions, num_feats):
        self.α = 0.01
        self.γ = 0.99
        self.num_feats = num_feats
        PolicyGradientAgent.__init__(
            self, name="logistic_policy_gradient", actions=actions
        )
        self.reset()

    @staticmethod
    def logistic(x):
        # Sigmoid squashing function.
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def π(θ, s):
        # Probability of each of CartPole's two actions under a logistic policy.
        π = LogisticPolicyAgent.logistic(np.dot(θ.T, s))
        return np.array([π, 1 - π])

    @staticmethod
    def Δ(θ, s):
        # Gradient of ln π(a|s) with respect to θ, for each of the two actions.
        π = LogisticPolicyAgent.logistic(np.dot(θ.T, s))
        return np.array([s - s * π, -s * π])

    def act(self, state, reward):
        if self.previous_pair is not None:
            self.episode_history.append(Step(self.previous_pair, reward))
        π = LogisticPolicyAgent.π(self.θ, state)
        action = np.random.choice((0, 1), p=π)
        self.previous_pair = Pair(state.data, action)
        return action

    def reset(self):
        self.θ = np.zeros(self.num_feats)
        self.end_of_episode()
        PolicyGradientAgent.reset(self)

    def end_of_episode(self):
        T = len(getattr(self, "episode_history", []))
        G = 0
        grad_buf = []
        for t in reversed(range(T)):
            # Discounted return from step t onwards.
            G = G * self.γ + self.episode_history[t].reward
            grad = LogisticPolicyAgent.Δ(self.θ, self.episode_history[t].pair.state)[
                self.episode_history[t].pair.action
            ]
            # REINFORCE update: θ ← θ + α γ^t G ∇ln π(a|s).
            self.θ += self.α * np.power(self.γ, t) * grad * G
            grad_buf.append(np.power(self.γ, t) * grad * G)
        # Record gradient statistics in a global buffer (created in the training cell).
        reinforce_gradient_buffer.append(
            [np.mean(np.abs(grad_buf)), np.std(grad_buf)])
        self.episode_history = []
        self.previous_pair = None
        PolicyGradientAgent.end_of_episode(self)
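
As a quick sanity check of the policy itself (my own addition, not used anywhere in training), you can call the static π function directly: with θ = 0 the logistic policy is uniform over CartPole’s two actions, and the two probabilities always sum to one.

# Sanity check (not part of training): the policy returns a valid distribution.
θ_test = np.zeros(num_feats)
s_test = np.array([0.02, -0.01, 0.03, 0.04])  # an arbitrary CartPole state
print(LogisticPolicyAgent.π(θ_test, s_test))                    # [0.5 0.5]
print(LogisticPolicyAgent.π(np.ones(num_feats), s_test).sum())  # ≈ 1.0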

REINFORCE with Baseline Agent

The new agent is essentially the same as the standard REINFORCE agent, except that it maintains a weighted moving average of the observed returns as a baseline estimate, which is subtracted from the return before the policy update.
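
Written out, the baseline update is w ← w + α_w (G − w) and the policy update becomes θ ← θ + α γ^t (G − w) ∇ln π(a|s). Below is a standalone sketch (not part of the agent) of the moving-average estimate settling on the mean of a stream of noisy, made-up returns.

# Standalone sketch (not part of the agent): the weighted moving average
# w ← w + α_w (G − w) tracks the mean of a stream of noisy returns.
rng = np.random.default_rng(0)
α_w = 0.1
w = 0.0
for _ in range(200):
    G = rng.normal(50.0, 5.0)  # a noisy sampled return
    w += α_w * (G - w)
print(w)  # ≈ 50, close to the average return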


class LogisticPolicyAgentWithBaseline(PolicyGradientAgent):
    def __init__(self, actions, num_feats, α=0.01, prefix=""):
        self.α = α      # policy step size
        self.α_w = 0.1  # baseline (moving-average) step size
        self.γ = 0.99
        self.num_feats = num_feats
        PolicyGradientAgent.__init__(
            self, name=prefix + "logistic_policy_gradient_with_baseline", actions=actions
        )
        self.reset()

    def act(self, state, reward):
        if self.previous_pair is not None:
            self.episode_history.append(Step(self.previous_pair, reward))
        π = LogisticPolicyAgent.π(self.θ, state)
        action = np.random.choice((0, 1), p=π)
        self.previous_pair = Pair(state.data, action)
        self.t += 1
        return action

    def reset(self):
        self.θ = np.zeros(self.num_feats)
        self.w = 0
        self.end_of_episode()
        PolicyGradientAgent.reset(self)

    def end_of_episode(self):
        self.t = 0
        T = len(getattr(self, "episode_history", []))
        G = 0
        grad_buf = []
        for t in reversed(range(T)):
            # Discounted return from step t onwards.
            G = G * self.γ + self.episode_history[t].reward
            # Advantage of the observed return over the baseline estimate.
            δ = G - self.w
            # Record (state, return) pairs for the visualisation later on.
            global_buffer.append(
                np.concatenate(
                    [self.episode_history[t].pair.state, np.array([G])])
            )
            # Move the baseline towards the observed return.
            self.w += self.α_w * δ
            Δπ = LogisticPolicyAgent.Δ(self.θ, self.episode_history[t].pair.state)[
                self.episode_history[t].pair.action
            ]
            # Policy update: θ ← θ + α γ^t δ ∇ln π(a|s).
            self.θ += self.α * np.power(self.γ, t) * δ * Δπ
            grad_buf.append(np.power(self.γ, t) * δ * Δπ)
        baseline_gradient_buffer.append(
            [np.mean(np.abs(grad_buf)), np.std(grad_buf)])
        self.episode_history = []
        self.previous_pair = None
        PolicyGradientAgent.end_of_episode(self)

Training the Agent

Now I’m ready to run the experiments to train both agents. You might want to play around with the instances parameter, which controls the number of repeated runs that results are averaged over.

from simple_rl.run_experiments import run_agents_on_mdp
from collections import namedtuple

Step = namedtuple("Step", ["pair", "reward"])
Pair = namedtuple("Pair", ["state", "action"])

reinforce_gradient_buffer = []
REINFORCE = LogisticPolicyAgent(gym_mdp.get_actions(), num_feats)
run_agents_on_mdp(
    [REINFORCE],
    gym_mdp,
    instances=2,
    episodes=500,
    steps=1000,
    open_plot=False,
    verbose=False,
    cumulative_plot=False,
)
np.savetxt("gradient_REINFORCE.txt", np.array(reinforce_gradient_buffer))

global_buffer = []
baseline_gradient_buffer = []
REINFORCE_baseline = LogisticPolicyAgentWithBaseline(
    gym_mdp.get_actions(), num_feats
)
run_agents_on_mdp(
    [REINFORCE_baseline],
    gym_mdp,
    instances=2,
    episodes=500,
    steps=1000,
    open_plot=False,
    verbose=False,
    cumulative_plot=False,
)
np.savetxt("state_reward_REINFORCE_baseline.txt", np.array(global_buffer))
np.savetxt("gradient_REINFORCE_baseline.txt",
            np.array(baseline_gradient_buffer))
Running experiment: 
(MDP)
    gym-CartPole-v1
(Agents)
    logistic_policy_gradient,0
(Params)
    instances : 2
    episodes : 500
    steps : 1000
    track_disc_reward : False

logistic_policy_gradient is learning.
  Instance 1 of 2.
  Instance 2 of 2.


--- TIMES ---
logistic_policy_gradient agent took 119.83 seconds.
-------------

    logistic_policy_gradient: 176.0 (conf_interv: 67.91 )
Running experiment: 
(MDP)
    gym-CartPole-v1
(Agents)
    logistic_policy_gradient_with_baseline,0
(Params)
    instances : 2
    episodes : 500
    steps : 1000
    track_disc_reward : False

logistic_policy_gradient_with_baseline is learning.
  Instance 1 of 2.
  Instance 2 of 2.


--- TIMES ---
logistic_policy_gradient_with_baseline agent took 155.76 seconds.
-------------

    logistic_policy_gradient_with_baseline: 397.0 (conf_interv: 40.19 )

Visualising the Discounted Reward

The baseline algorithm needs to be able to predict the expected return for a given state, which means you need a model that represents the data well. I tried a few approaches, but quickly realised that the data are remarkably complex. The code below takes the recorded trajectories, runs principal component analysis on the states, and plots the first component against the discounted reward.

Remember that there are four state features, so the space is more complex than this one-dimensional projection suggests. But you can quickly see that the data has a strange, roughly triangular shape. This is primarily because each return depends on the previous one; the return is cumulative. Given that, I decided to use a simple online moving average of previously observed returns as the baseline.

%matplotlib inline
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

data = np.loadtxt("state_reward_REINFORCE_baseline.txt")

x = data[:, 0:4]
pca = PCA(n_components=1)
x = pca.fit_transform(x)
y = data[:, 4]

fig, ax = plt.subplots(nrows=1, ncols=1)
ax.plot(x, y, marker='.', alpha=0.1, markeredgewidth=0.0)
ax.set_xlabel('First Principal Component of State')
ax.set_ylabel('Discounted Reward')
plt.show()

Plotting the Results

The following will read from the saved results.

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

def plot(experiment_name, data_files, cutoff=None):
    fig, ax = plt.subplots(nrows=1, ncols=1)
    for j, (name, data_file) in enumerate(data_files):
        df = pd.read_csv(data_file, header=None).transpose()
        if cutoff:
            df = df.truncate(after=cutoff)
        x = df.index.values
        y = df.values
        if len(y.shape) > 1:
            y = y.mean(axis=1)
        ax.plot(x,
                y,
                label=name)

    ax.set_xlabel('Episode')
    ax.set_ylabel('Average Reward (2 instances)')
    ax.legend(loc='lower right')
    plt.show()

data_files = [
    ("REINFORCE with baseline (logistic)", "results/gym-CartPole-v1/logistic_policy_gradient_with_baseline.csv"),
    ("REINFORCE (logistic)", "results/gym-CartPole-v1/logistic_policy_gradient.csv"),
]
plot("reinforce_reward_plot", data_files, cutoff=500)

Discussion

You can see that the baseline algorithm performs better than the basic REINFORCE algorithm, if only just. If you performed more repeats, the difference would become clearer.

I encourage you to try this yourself with a single repeat. You should be able to observe the baseline algorithm improving more consistently than the standard algorithm, which repeatedly drops back to zero. This happens because of subtle but catastrophic updates to the policy parameters: occasionally a large, high-variance return nudges the agent towards a bad policy.
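
If you want to look at this instability directly, the gradient statistics saved earlier (gradient_REINFORCE.txt and gradient_REINFORCE_baseline.txt, one row of [mean |gradient|, std] per episode) can be plotted. The sketch below is my own addition rather than part of the original experiment, and the smoothing window is an arbitrary choice.

# A rough sketch (my addition): compare the per-episode gradient magnitudes
# recorded by the two agents during training.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=1)
for name, path in [
    ("REINFORCE (logistic)", "gradient_REINFORCE.txt"),
    ("REINFORCE with baseline (logistic)", "gradient_REINFORCE_baseline.txt"),
]:
    mean_abs_grad = np.loadtxt(path)[:, 0]
    mean_abs_grad = mean_abs_grad[~np.isnan(mean_abs_grad)]  # drop empty-episode rows
    smoothed = pd.Series(mean_abs_grad).rolling(20).mean()   # arbitrary smoothing window
    ax.plot(smoothed, label=name)
ax.set_xlabel('Episode')
ax.set_ylabel('Mean absolute gradient (smoothed)')
ax.legend(loc='upper right')
plt.show()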