# main_planner.py
# libraries:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
# environment and utils:
from folsom.folsom import Folsom
from machinery import *
from agents import Planner
# based on the example at: https://keras.io/examples/rl/ddpg_pendulum/
# implements the three TD3 adjustments to the original DDPG algorithm outlined in Fujimoto et al. (2018)
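# The three TD3 adjustments (Fujimoto et al., 2018) are:
#   1. clipped double-Q learning: two critics, with the smaller target-Q value used in the TD target
#   2. delayed policy updates: the actor and target networks update less often than the critics
#   3. target policy smoothing: noise added to the target action when forming the TD target
# (How each is realized lives in `machinery`/`agents`; the loop below only shows the delayed updates.)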
def main():
    DATA_DIR = Path.cwd()
    STOR_DIR = DATA_DIR / 'results_planner_10'
    STOR_DIR.mkdir(exist_ok=True)
    # OLD_DIR = DATA_DIR / 'results_planner_5'
    OLD_DIR = None
    num_res_states = 2  # day-of-year and reservoir storage
    inflow_stack = 15  # 15 future inflow features at various time scales between 1 day and 5 years
    TD3 = True  # use TD3 adjustments to DDPG from Fujimoto et al. (2018)
    warmup = True  # use warmup period: fill buffer w/ random uniform action selection experiences
    eps = 0.1  # if not None, use epsilon-greedy action exploration strategy
    epi_start = 30 * 365  # day of ensemble member to start episode on
    epi_steps = 40 * 365  # length of episode in days
    max_epi = 100  # upper bound used to determine annealing schedule (if not epsilon-greedy)
    models = ['canesm2',]
    ensembles = ['r1i1p1' for i in np.arange(0,10)]
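    # NOTE: this list repeats the single 'r1i1p1' member ten times, so the training
    # loop below makes ten passes over the same ensemble member.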
    env = Folsom(DATA_DIR, res_dim=(num_res_states+inflow_stack,), inflow_stack=inflow_stack,
                 models=models, ensembles=ensembles)  # policy inputs: storage, inflows, day of year
    agent, env = Planner.create(env, weights_dir=OLD_DIR, TD3=TD3, warmup=warmup, eps=eps,
                                epi_start=epi_start, epi_steps=epi_steps,
                                max_epi=max_epi)  # this has actor/critic and target networks contained within
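    # Planner.create is assumed to resume from saved weights when weights_dir is not None,
    # which would let training continue from an earlier run if OLD_DIR were set above.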
    obs = env.reset(agent)
    # extract state and action dimensions from the environment:
    env.seed(123)  # not really used yet
    assert len(env.action_space.shape) == 1  # one continuous reservoir release as the action
    num_states = env.observation_space.shape
    print("Size of State Space -> {}".format(num_states))
    num_actions = env.action_space.shape[0]
    print("Size of Action Space -> {}".format(num_actions))
    num_res_states = env.reservoir_space.shape
    print("Size of Reservoir State Space -> {}".format(num_res_states))
    upper_bound = env.action_space.high[0]
    lower_bound = env.action_space.low[0]
    print("Max Value of Action -> {}".format(upper_bound))
    print("Min Value of Action -> {}".format(lower_bound))
    # training hyperparameters:
    std_dev = 1.
    agent.noise_object = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
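    # OUActionNoise (from machinery's star import) presumably implements an
    # Ornstein-Uhlenbeck process, as in the original DDPG paper (Lillicrap et al., 2015):
    # it yields temporally correlated exploration noise, which suits systems with
    # inertia like reservoir release schedules.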
    # learning rates for the actor-critic models:
    actor_lr = 10e-7   # = 1e-6; slowest learner
    critic_lr = 10e-7  # = 1e-6; the critic should learn at least as fast as the actor (equal here)
    # learning rate used to update the target networks:
    tau = 10e-2  # = 0.1; the target actor/critic networks slowly track the actor/critic weights
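    # update_target (from machinery) is assumed to apply the usual Polyak averaging rule,
    #   target_weights <- tau * online_weights + (1 - tau) * target_weights,
    # so with tau = 0.1 the targets close about 10% of the gap to the online networks per update.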
    # initialize optimizers:
    clipnorm = 0.1
    # critic_opt = tf.keras.optimizers.SGD(critic_lr)#,clipnorm=clipnorm,momentum=0.9) # also sorta worked
    # actor_opt = tf.keras.optimizers.SGD(actor_lr)#,clipnorm=clipnorm,momentum=0.9)
    agent.critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
    if TD3:
        agent.critic2_optimizer = tf.keras.optimizers.Adam(critic_lr)
    agent.actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
    # discount factor for future rewards:
    gamma = 0.99
    batch_size = 100
    buffer_capacity = 25*epi_steps
    agent.buffer = Buffer(env, agent, buffer_capacity, batch_size, gamma)
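    # Buffer (from machinery) is assumed to be a standard replay buffer: it holds up to
    # buffer_capacity transitions (25 episodes' worth, 25 * 14600 = 365000 steps),
    # samples minibatches of batch_size, and uses gamma to discount the TD targets.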
"""
Implement main training loop, and iterate over episodes.
Sample actions using `policy()` and train with `learn()` at each time step.
Update the Target networks at a rate `tau`.
"""
    # ensembles = ['r{}i1p1'.format(i) for i in np.arange(1,2)]
    ensembles = ['r1i1p1' for i in np.arange(0,10)]  # re-declares the same list as above
    results = {}  # not used yet
    if warmup:
        agent.buffer.warmup(env, agent, OLD_DIR)
    for ens in ensembles:
        print('Running data from ensemble member: {}'.format(ens))
        average_reward, average_action, ens_done = 0, 0, False
        while not ens_done:
            agent.epi_count += 1
            anneal_lr([agent.critic_optimizer, agent.critic2_optimizer, agent.actor_optimizer],
                      agent.epi_count, agent.max_epi, method='linear')
            prev_state = env.reset(agent)
            prev_res_state, episodic_reward, episodic_action, agent.epi_done = prev_state, 0, 0, False
            while not agent.epi_done:
                tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
                action, noise = agent.policy(tf_prev_state)
                state, reward, info = env.step(action, noise)
                res_state, ens_done, agent.epi_done = state, info['ens_done'], info['epi_done']
                agent.ens_done = ens_done  # mirror the flag on the agent; the local copy ends the outer loop
                episodic_reward += reward
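                # running means via the incremental update m_t = m_{t-1} + (x_t - m_{t-1}) / t,
                # using env.t as the step counter (assumed to be 1-based):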
                average_reward = average_reward + (reward - average_reward) / env.t
                average_action = average_action + (action[0] - average_action) / env.t
                if env.t % 1000 == 0:
                    env.render(mode=['console'])
                agent.buffer.record((prev_state, prev_res_state, action, reward, state, res_state))
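                # TD3's delayed policy updates: the critics learn every step, while the actor
                # and all target networks update only every other step (learn_actor_critic and
                # learn_critic are assumed to split these roles inside machinery's Buffer):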
                if agent.buffer.TD3:
                    if env.t % 2 == 0:
                        agent.buffer.learn_actor_critic(agent)
                        update_target(agent.target_actor.variables, agent.actor.variables, tau)
                        update_target(agent.target_critic.variables, agent.critic.variables, tau)
                        update_target(agent.target_critic2.variables, agent.critic2.variables, tau)
                    else:
                        agent.buffer.learn_critic(agent)
                else:
                    agent.buffer.learn(agent)
                    update_target(agent.target_actor.variables, agent.actor.variables, tau)
                    update_target(agent.target_critic.variables, agent.critic.variables, tau)
                prev_state = state
                prev_res_state = res_state
            agent.epi_reward_list.append(episodic_reward)
            agent.avg_reward_list.append(average_reward)
            agent.avg_action_list.append(average_action)
            agent.epi_avg_reward_list.append(np.mean(agent.epi_reward_list[-40:]))
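            # (the last entry above is a trailing 40-episode moving average of episodic reward)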
            if agent.epi_count % 5 == 0:
                # save the weights and render progress figures every 5 episodes
                agent.save_weights(STOR_DIR)
                env.render(agent=agent, STOR_DIR=STOR_DIR, mode=['console','figures'])
            print("Episode * {} * Avg Reward is ==> {}".format(agent.epi_count, agent.epi_avg_reward_list[-1]))
            if agent.epi_count >= max_epi:
                exit()
if __name__ == '__main__':
    main()