# Q-Learning with Battery Example

The purpose of this notebook is to demonstrate the ability of a reinforcement learning agent based on Q-Learning to learn to control a battery.
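
As a reminder of the method (standard Q-Learning, not anything specific to energy_py), the agent learns an action-value function Q(s,a) by nudging its estimate towards a bootstrapped target built from the reward and the value of the next state:

$$Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right]$$

DQN approximates Q(s,a) with a neural network and stabilizes learning with experience replay and a separate target network - both of which appear in the training loop below.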

In [1]:
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf

from energy_py import EternityVisualizer
from energy_py.agents import DQN, Q_DQN
from energy_py.envs import BatteryEnv

In [2]:
#  set random seeds for repeatability
np.random.seed(42)
tf.set_random_seed(42)

In [3]:
#  let our environment know where our state & observation data is
data_path = os.getcwd()

#  keep all of the BatteryEnv variables (episode length, efficiency etc)
#  at their defaults
env = BatteryEnv(data_path)

In [4]:
#  define a batch size, discount rate and total number of episodes
BATCH_SIZE = 32
DISCOUNT = 0.99
EPISODES = 5000

#  in order to set up hyperparameters like epsilon decay or target net
#  update frequency, we need to let our agent know how many total steps
#  it will take in its life
total_steps = env.episode_length * EPISODES

#  now we setup our agent
#  we pass in an object to approximate Q(s,a)
#  this object is an energy_py function approximator that uses
#  Tensorflow to estimate expected discounted return for each action
agent = DQN(env,
            discount=DISCOUNT,
            Q=Q_DQN,
            total_steps=total_steps,
            discrete_space_size=5)

#  we can have a look at the discretized action space
agent.actions

Out[4]:
array([[ 0. ,  0. ],
       [ 0. ,  0.5],
       [ 0. ,  1. ],
       [ 0. ,  1.5],
       [ 0. ,  2. ],
       [ 0.5,  0. ],
       [ 0.5,  0.5],
       [ 0.5,  1. ],
       [ 0.5,  1.5],
       [ 0.5,  2. ],
       [ 1. ,  0. ],
       [ 1. ,  0.5],
       [ 1. ,  1. ],
       [ 1. ,  1.5],
       [ 1. ,  2. ],
       [ 1.5,  0. ],
       [ 1.5,  0.5],
       [ 1.5,  1. ],
       [ 1.5,  1.5],
       [ 1.5,  2. ],
       [ 2. ,  0. ],
       [ 2. ,  0.5],
       [ 2. ,  1. ],
       [ 2. ,  1.5],
       [ 2. ,  2. ]])
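
The 25 discrete actions above are just the Cartesian product of five evenly spaced levels per action dimension (discrete_space_size=5 over a two-dimensional action space). A minimal sketch of how such a grid could be built - the bounds and variable names here are illustrative, not the energy_py internals:

import itertools
import numpy as np

#  illustrative bounds - two action dimensions, each spanning 0 to 2
low, high, n_levels = [0.0, 0.0], [2.0, 2.0], 5

#  evenly spaced levels for each dimension
levels = [np.linspace(l, h, n_levels) for l, h in zip(low, high)]

#  Cartesian product -> one row per discrete action (5 * 5 = 25)
discrete_actions = np.array(list(itertools.product(*levels)))
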
In [5]:
#  simple class to time the experiment

class Timer(object):
    def __init__(self):
        self.start_time = time.time()

    def calc_time(self, episode, avg_reward):
        run_time = time.time() - self.start_time
        avg_time = run_time / episode
        print('{:1.0f} episodes in {:2.0f} min - avg {:.2f} sec per episode - avg lifetime reward {:3.3f} $/5min'.format(
            episode, run_time / 60, avg_time, avg_reward))
        return run_time, avg_time

In [6]:
#  we track total steps to know when to learn and
#  update the target net
total_step = 0
timer = Timer()

#  initialize Tensorflow machinery
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    #  for loop over episodes
    for episode in range(1, EPISODES):

        #  initialize before starting episode
        done, step = False, 0

        #  reset the environment
        observation = env.reset()

        #  a while loop to run through a single episode
        #  will terminate when the env returns done=True after a step
        while done is False:
            #  agent uses observation to select an action
            action = agent.act(sess=sess, obs=observation)

            #  take a step through the environment
            next_observation, reward, done, info = env.step(action)

            #  store the experience in the agent memory
            agent.memory.add_experience(observation, action, reward,
                                        next_observation, done, step, episode)

            #  the DQN agent has an initial number of steps of
            #  no learning to fill its memory
            #  if we are beyond this we learn at each step
            if total_step > agent.initial_random:

                #  get a batch of experience
                #  this is naive experience replay (no prioritization)
                batch = agent.memory.get_random_batch(BATCH_SIZE)

                #  learn using the batch
                training_info = agent.learn(sess=sess, batch=batch)

                #  optionally update the target network
                if total_step % agent.update_target_net == 0:
                    agent.update_target_network(sess)

            #  move on to the next step
            step += 1
            total_step += 1
            observation = next_observation

        if episode % 200 == 0:
            avg_reward = np.mean(agent.memory.rewards)
            run_time, avg_time = timer.calc_time(episode, avg_reward)

#  one final run of the timer
run_time, avg_time = timer.calc_time(episode, avg_reward)

200 episodes in  0 min - avg 0.04 sec per episode - avg lifetime reward -0.452 $/5min
400 episodes in  0 min - avg 0.04 sec per episode - avg lifetime reward -0.477 $/5min
600 episodes in  1 min - avg 0.06 sec per episode - avg lifetime reward -0.466 $/5min
800 episodes in  1 min - avg 0.10 sec per episode - avg lifetime reward -0.364 $/5min
1000 episodes in  2 min - avg 0.12 sec per episode - avg lifetime reward -0.215 $/5min
1200 episodes in  3 min - avg 0.14 sec per episode - avg lifetime reward -0.031 $/5min
1400 episodes in  4 min - avg 0.16 sec per episode - avg lifetime reward 0.143 $/5min
1600 episodes in  5 min - avg 0.18 sec per episode - avg lifetime reward 0.314 $/5min
1800 episodes in  6 min - avg 0.19 sec per episode - avg lifetime reward 0.484 $/5min
2000 episodes in  7 min - avg 0.20 sec per episode - avg lifetime reward 0.651 $/5min
2200 episodes in  8 min - avg 0.22 sec per episode - avg lifetime reward 0.813 $/5min
2400 episodes in  9 min - avg 0.23 sec per episode - avg lifetime reward 0.969 $/5min
2600 episodes in 10 min - avg 0.24 sec per episode - avg lifetime reward 1.130 $/5min
2800 episodes in 12 min - avg 0.25 sec per episode - avg lifetime reward 1.281 $/5min
3000 episodes in 13 min - avg 0.26 sec per episode - avg lifetime reward 1.432 $/5min
3200 episodes in 15 min - avg 0.27 sec per episode - avg lifetime reward 1.570 $/5min
3400 episodes in 16 min - avg 0.29 sec per episode - avg lifetime reward 1.696 $/5min
3600 episodes in 18 min - avg 0.30 sec per episode - avg lifetime reward 1.807 $/5min
3800 episodes in 20 min - avg 0.31 sec per episode - avg lifetime reward 1.904 $/5min
4000 episodes in 21 min - avg 0.32 sec per episode - avg lifetime reward 1.990 $/5min
4200 episodes in 23 min - avg 0.33 sec per episode - avg lifetime reward 2.065 $/5min
4400 episodes in 25 min - avg 0.34 sec per episode - avg lifetime reward 2.139 $/5min
4600 episodes in 27 min - avg 0.36 sec per episode - avg lifetime reward 2.205 $/5min
4800 episodes in 29 min - avg 0.37 sec per episode - avg lifetime reward 2.265 $/5min
4999 episodes in 32 min - avg 0.38 sec per episode - avg lifetime reward 2.265 $/5min

In [7]:
#  now our experiment is over, we can look at the results
#  create an object to collect data and create figures
hist = EternityVisualizer(agent, env)

#  generate the results - figures for both the agent and the environment
agent_outputs, env_outputs = hist.output_results(save_data=False)

/Users/adam/anaconda3/envs/energy_py/lib/python3.5/site-packages/matplotlib/pyplot.py:523: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (matplotlib.pyplot.figure) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam figure.max_open_warning).
max_open_warning, RuntimeWarning)

In [8]:
#  the Eternity Visualizer has created a number of figures from
#  the agent and environment
hist.figs_dict.keys()

Out[8]:
dict_keys(['new_charge', 'scaled_targets', 'loss', 'done', 'max_Q_acting estimates', 'train_error', 'unscaled_targets', 'max_scaled_target', 'losses', 'reward', 'old_charge', 'avg_Q_acting estimates', 'env_panel_fig', 'scaled_obs', 'steps', 'avg_scaled_target', 'gross_rate', 'Q act est', 'net_stored', 'reward_panel', 'epsilon', 'electricity_price'])
In [9]:
#  jupyter magic command to display matplotlib figures inline
%matplotlib inline

#  the most useful figure is the reward panel, showing:
#  1 reward per episode (and max reward seen so far)
#  2 the mean total reward of the last 10% of episodes
#    shaded = the standard deviation (also last 10% of episodes)
hist.figs_dict['reward_panel']

Out[9]:
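
The rolling statistics in that panel can be reproduced directly from the per-episode rewards. A minimal sketch, assuming episode_rewards is a pandas Series of total reward per episode (a placeholder variable here, not part of energy_py):

import numpy as np
import pandas as pd

#  illustrative placeholder - in the notebook these rewards come from the agent memory
episode_rewards = pd.Series(np.random.normal(size=1000))

#  rolling mean and standard deviation over a window covering 10% of the episodes
window = max(1, int(0.1 * len(episode_rewards)))
rolling_mean = episode_rewards.rolling(window).mean()
rolling_std = episode_rewards.rolling(window).std()
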
In [10]:
#  we can also take a look at a figure generated by the Battery env
#  showing the results of the last episode
#  1 - the gross rate of charge/discharge
#  2 - the charge level of the battery at the end of each step
#  3 - the electricity price
hist.figs_dict['env_panel_fig']

Out[10]:
In [11]:
#  the loss curve shows where we update the target network
#  parameters - the loss spikes up at each update
hist.figs_dict['loss']

Out[11]:
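
Those spikes make sense given how the DQN loss is built. The Bellman target uses a frozen copy of the network (the target network, parameters θ⁻), so each time those parameters are copied across, the regression targets shift and the loss jumps:

$$L(\theta) = \mathbb{E}_{(s, a, r, s') \sim \mathcal{D}}\left[\left(r + \gamma \max_{a'} Q(s', a'; \theta^{-}) - Q(s, a; \theta)\right)^{2}\right]$$

where D is the replay memory and θ are the online network parameters.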
In [12]:
#  we can have a look at the epsilon decay schedule
hist.figs_dict['epsilon']

Out[12]:
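
The epsilon-greedy exploration schedule is parameterized by total_steps, which is why we passed it to the agent at construction. A minimal sketch of a linear decay over an agent's lifetime - the start value, floor and decay fraction here are illustrative assumptions, not necessarily the defaults used inside DQN:

def linear_epsilon(step, total_steps, eps_start=1.0, eps_end=0.05, decay_fraction=0.5):
    """Linearly anneal epsilon over the first decay_fraction of total_steps."""
    decay_steps = decay_fraction * total_steps
    fraction = min(step / decay_steps, 1.0)
    return eps_start + fraction * (eps_end - eps_start)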