Q-Learning with Battery Example

The purpose of this notebook is to demonstrate that a reinforcement learning agent based on Q-Learning can learn to control a battery.
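As a quick refresher: Q-Learning learns an estimate of the action-value function Q(s,a) by repeatedly nudging it towards the Bellman target. A minimal tabular sketch of that update is below (illustrative only - the DQN agent used in this notebook replaces the table with a TensorFlow neural network):

import numpy as np

def q_learning_update(Q, s, a, r, s_next, done, lr=0.1, discount=0.99):
    #  the Bellman target bootstraps from the best action in the next state
    target = r + (1 - done) * discount * np.max(Q[s_next])
    #  move the current estimate a small step towards the target
    Q[s, a] += lr * (target - Q[s, a])
    return Q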

In [1]:
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf

from energy_py import EternityVisualizer
from energy_py.agents import DQN, Q_DQN
from energy_py.envs import BatteryEnv
In [2]:
#  set random seeds for repeatability
np.random.seed(42)
tf.set_random_seed(42)
In [3]:
#  let our environment know where our state & observation data is
data_path = os.getcwd()

#  keep all of the BatteryEnv variables (episode length, efficiency etc)
#  at their defaults
env = BatteryEnv(data_path)
In [4]:
#  define a batch size, discount rate and total number of episodes 
BATCH_SIZE = 32
DISCOUNT = 0.99
EPISODES = 5000

#  in order to set up hyperparameters like the epsilon decay or target net
#  update frequency, we need to let our agent know how many total steps
#  it will take over its lifetime
total_steps = env.episode_length * EPISODES

#  now we set up our agent
#  we pass in an object to approximate Q(s,a)
#  this object is an energy_py function approximator that uses
#  TensorFlow to estimate the expected discounted return for each action
agent = DQN(env,
            discount=DISCOUNT,
            Q=Q_DQN,
            total_steps=total_steps,
            discrete_space_size=5)

#  we can have a look at the discretized action space
agent.actions
Out[4]:
array([[ 0. ,  0. ],
       [ 0. ,  0.5],
       [ 0. ,  1. ],
       [ 0. ,  1.5],
       [ 0. ,  2. ],
       [ 0.5,  0. ],
       [ 0.5,  0.5],
       [ 0.5,  1. ],
       [ 0.5,  1.5],
       [ 0.5,  2. ],
       [ 1. ,  0. ],
       [ 1. ,  0.5],
       [ 1. ,  1. ],
       [ 1. ,  1.5],
       [ 1. ,  2. ],
       [ 1.5,  0. ],
       [ 1.5,  0.5],
       [ 1.5,  1. ],
       [ 1.5,  1.5],
       [ 1.5,  2. ],
       [ 2. ,  0. ],
       [ 2. ,  0.5],
       [ 2. ,  1. ],
       [ 2. ,  1.5],
       [ 2. ,  2. ]])
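The 25 rows above are every combination of five evenly spaced levels across the two continuous action dimensions. A sketch of how such a discretization can be built (not necessarily how energy_py constructs it internally):

from itertools import product

import numpy as np

#  five evenly spaced levels per action dimension (discrete_space_size=5)
levels = np.linspace(0, 2, 5)

#  the Cartesian product gives the 5 * 5 = 25 discrete actions shown above
discrete_actions = np.array(list(product(levels, repeat=2)))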
In [5]:
#  simple class to time the experiment

class Timer(object):
    def __init__(self):
        self.start_time = time.time()
        
    def calc_time(self, episode, avg_reward):
        run_time = time.time() - self.start_time
        avg_time = run_time / episode
        print('{:1.0f} episodes in {:2.0f} min - avg {:.2f} sec per episode '
              '- avg lifetime reward {:3.3f} $/5min'.format(
                  episode, run_time / 60, avg_time, avg_reward))
        return run_time, avg_time
In [6]:
#  we track total steps to know when to learn and 
#  update the target net
total_step = 0
timer = Timer()

#  initialize the TensorFlow session and variables
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    #  for loop over episodes
    for episode in range(1, EPISODES):
        #  initialize before starting episode
        done, step = False, 0
        #  reset the environment
        observation = env.reset()
        
        #  a while loop to run through a single episode
        #  will terminate when the env returns done=True after a step
        while not done:
            #  agent uses observation to select an action
            action = agent.act(sess=sess, obs=observation)
            
            #  take a step through the environment
            next_observation, reward, done, info = env.step(action)
            
            #  store the experience in the agent memory
            agent.memory.add_experience(observation, action, reward,
                                        next_observation, done,
                                        step, episode)
            
            #  the DQN agent takes an initial number of random steps
            #  with no learning in order to fill its memory
            #  once we are past this point, we learn at each step
            if total_step > agent.initial_random:
                #  get a batch of experience
                #  this is naive experience replay (no prioritization)
                batch = agent.memory.get_random_batch(BATCH_SIZE)
                
                #  learn using the batch
                training_info = agent.learn(sess=sess, batch=batch)
                
                #  periodically update the target network
                if total_step % agent.update_target_net == 0:
                    agent.update_target_network(sess)
                    
            #  move on to the next step
            step += 1
            total_step += 1
            observation = next_observation
            
        if episode % 200 == 0:
            avg_reward = np.mean(agent.memory.rewards)
            run_time, avg_time = timer.calc_time(episode, avg_reward)
            
#  one final run of the timer
run_time, avg_time = timer.calc_time(episode, avg_reward)
200 episodes in  0 min - avg 0.04 sec per episode - avg lifetime reward -0.452 $/5min
400 episodes in  0 min - avg 0.04 sec per episode - avg lifetime reward -0.477 $/5min
600 episodes in  1 min - avg 0.06 sec per episode - avg lifetime reward -0.466 $/5min
800 episodes in  1 min - avg 0.10 sec per episode - avg lifetime reward -0.364 $/5min
1000 episodes in  2 min - avg 0.12 sec per episode - avg lifetime reward -0.215 $/5min
1200 episodes in  3 min - avg 0.14 sec per episode - avg lifetime reward -0.031 $/5min
1400 episodes in  4 min - avg 0.16 sec per episode - avg lifetime reward 0.143 $/5min
1600 episodes in  5 min - avg 0.18 sec per episode - avg lifetime reward 0.314 $/5min
1800 episodes in  6 min - avg 0.19 sec per episode - avg lifetime reward 0.484 $/5min
2000 episodes in  7 min - avg 0.20 sec per episode - avg lifetime reward 0.651 $/5min
2200 episodes in  8 min - avg 0.22 sec per episode - avg lifetime reward 0.813 $/5min
2400 episodes in  9 min - avg 0.23 sec per episode - avg lifetime reward 0.969 $/5min
2600 episodes in 10 min - avg 0.24 sec per episode - avg lifetime reward 1.130 $/5min
2800 episodes in 12 min - avg 0.25 sec per episode - avg lifetime reward 1.281 $/5min
3000 episodes in 13 min - avg 0.26 sec per episode - avg lifetime reward 1.432 $/5min
3200 episodes in 15 min - avg 0.27 sec per episode - avg lifetime reward 1.570 $/5min
3400 episodes in 16 min - avg 0.29 sec per episode - avg lifetime reward 1.696 $/5min
3600 episodes in 18 min - avg 0.30 sec per episode - avg lifetime reward 1.807 $/5min
3800 episodes in 20 min - avg 0.31 sec per episode - avg lifetime reward 1.904 $/5min
4000 episodes in 21 min - avg 0.32 sec per episode - avg lifetime reward 1.990 $/5min
4200 episodes in 23 min - avg 0.33 sec per episode - avg lifetime reward 2.065 $/5min
4400 episodes in 25 min - avg 0.34 sec per episode - avg lifetime reward 2.139 $/5min
4600 episodes in 27 min - avg 0.36 sec per episode - avg lifetime reward 2.205 $/5min
4800 episodes in 29 min - avg 0.37 sec per episode - avg lifetime reward 2.265 $/5min
4999 episodes in 32 min - avg 0.38 sec per episode - avg lifetime reward 2.265 $/5min
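agent.learn hides the details of the DQN update. Conceptually, each call builds a Bellman target from the target network and reduces the error between that target and the online network's estimate for the action that was taken. A rough sketch, assuming the batch has been unpacked into numpy arrays and that q_online and q_target are callables returning Q-values of shape (batch_size, num_actions) - the actual energy_py implementation will differ in its details:

import numpy as np

def dqn_targets(rewards, next_obs, dones, q_target, discount=0.99):
    #  bootstrap from the best action according to the target network
    max_next_q = np.max(q_target(next_obs), axis=1)
    #  terminal steps get no bootstrapped value
    return rewards + (1 - dones) * discount * max_next_q

def td_errors(obs, actions, targets, q_online):
    #  Q-value of the action actually taken, according to the online network
    q_taken = q_online(obs)[np.arange(len(actions)), actions]
    #  a DQN typically minimizes the mean squared value of these errors
    return targets - q_taken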
In [7]:
#  now our experiment is over, we can look at the results
#  create an object to collect data and create figures
hist = EternityVisualizer(agent, env)

#  run the visualizer to generate the output data and figures
agent_outputs, env_outputs = hist.output_results(save_data=False)
/Users/adam/anaconda3/envs/energy_py/lib/python3.5/site-packages/matplotlib/pyplot.py:523: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
In [8]:
#  the Eternity Visualizer has created a number of figures from
#  the agent and environment
hist.figs_dict.keys()
Out[8]:
dict_keys(['new_charge', 'scaled_targets', 'loss', 'done', 'max_Q_acting estimates', 'train_error', 'unscaled_targets', 'max_scaled_target', 'losses', 'reward', 'old_charge', 'avg_Q_acting estimates', 'env_panel_fig', 'scaled_obs', 'steps', 'avg_scaled_target', 'gross_rate', 'Q act est', 'net_stored', 'reward_panel', 'epsilon', 'electricity_price'])
In [9]:
#  Jupyter magic command to render matplotlib figures inline
%matplotlib inline  

#  the most useful figure is the reward panel, showing:
#  1 - the reward per episode (and the maximum reward seen so far)
#  2 - the mean total reward over the last 10% of episodes,
#      with the shaded band showing the standard deviation over the same window
hist.figs_dict['reward_panel']
Out[9]:
[figure: reward panel]
In [10]:
#  we can also take a look at a figure generated by the Battery env
#  showing the results of the last episode
#  1 - the gross rate of charge/discharge
#  2 - the charge level of the battery at the end of each step
#  3 - the electricity price
hist.figs_dict['env_panel_fig']
Out[10]:
[figure: battery environment panel]
In [11]:
#  we can see from the loss curve where the target network
#  parameters are updated, as the loss spikes upwards at each update
hist.figs_dict['loss']
Out[11]:
[figure: loss]
In [12]:
#  we can have a look at the epsilon decay schedule
hist.figs_dict['epsilon']
Out[12]:
[figure: epsilon decay schedule]
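The epsilon figure typically shows epsilon decaying from 1.0 (fully random actions) towards a small final value over a fraction of the agent's total steps, which is why the agent needed total_steps at construction time. A minimal sketch of a linear schedule (the exact schedule energy_py uses may differ):

def linear_epsilon(step, total_steps, eps_start=1.0, eps_end=0.05, decay_fraction=0.5):
    #  decay epsilon linearly over the first decay_fraction of total steps,
    #  then hold it at eps_end for the remainder of the experiment
    decay_steps = total_steps * decay_fraction
    fraction = min(step / decay_steps, 1.0)
    return eps_start + fraction * (eps_end - eps_start)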