# run_code.py
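# Runner script for the Unity ML-Agents "Reacher" continuous-control environment
# (Udacity's p2-continuous-control project, judging by the paths below). It loads
# the environment, inspects the observation/action spaces, runs a quick random-action
# episode, and then trains or evaluates a DDPG or D4PG agent depending on the
# `model` and `train_or_test` flags set near the top of the file.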
from unityagents import UnityEnvironment
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
from collections import deque
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from ddpg_agent import Agent
from d4pg_agent import AgentD4PG
# define model
DDPG = 0
D4PG = 1
model = DDPG
# define train/test
TRAIN = 0
TEST = 1
train_or_test = TEST
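# These two flags select which of the train/test blocks at the bottom of the file runs.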
# single agent
env_path = '/home/alexander/RLND-project_Updated/p2-continuous-control/Reacher_Linux/Reacher.x86_64'
env = UnityEnvironment(file_name=env_path)
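# NOTE: env_path is an absolute, machine-specific path; point it at your own local
# build of the single-agent Reacher environment before running.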
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])
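# Sanity check: play one episode with random actions (clipped standard-normal noise,
# no learning) to confirm the environment step/reset loop works before training.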
env_info = env.reset(train_mode=True)[brain_name] # reset the environment
states = env_info.vector_observations # get the current state (for each agent)
scores = np.zeros(num_agents) # initialize the score (for each agent)
while True:
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
    env_info = env.step(actions)[brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += env_info.rewards                         # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))
def plot_scores(scores, rolling_window=100):
    '''Plot the episodic score and its moving average on the same chart.'''
    fig = plt.figure(figsize=(10, 5))
    plt.plot(np.arange(len(scores)), scores, '-c', label='episode score')
    plt.title('Episodic Score')
    rolling_mean = pd.Series(scores).rolling(rolling_window).mean()
    plt.plot(np.arange(len(scores)), rolling_mean, '-y', label='rolling_mean')
    plt.ylabel('score')
    plt.xlabel('episode #')
    plt.legend()
    if (model == DDPG):
        plt.savefig('DDPG_results.png')
    elif (model == D4PG):
        plt.savefig('D4PG_results.png')
######################## DDPG ################################################
# train the agent
def train_ddpg(agent, memory, n_episodes=10, mode='train',
               actor_pth='./checkpoint/ddpg_actor_checkpoint.pth',
               critic_pth='./checkpoint/ddpg_critic_checkpoint.pth'):
    '''Run the DDPG training loop and print episodic performance measures (average score, average loss).

    Params
    ======
        agent (class object)
        memory (class attribute): agent's attribute for memory size tracking
        mode (string): 'train' or 'test'; in test mode the agent follows its greedy policy only
        actor_pth / critic_pth (string path): file names for the checkpoints
    '''
    scores = []
    scores_window = deque(maxlen=100)                      # last 100 scores
    c_loss_window = deque(maxlen=100)
    a_loss_window = deque(maxlen=100)
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment and activate train_mode
        state = env_info.vector_observations[0]            # get the current state
        score = 0
        agent.running_c_loss = 0
        agent.running_a_loss = 0
        agent.training_cnt = 0
        # agent.reset()                                    # reset OUNoise
        while True:
            action = agent.act(state, mode)
            env_info = env.step(action)[brain_name]        # send the action to the environment
            next_state = env_info.vector_observations[0]   # get the next state
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        c_loss_window.append(agent.running_c_loss/(agent.training_cnt+0.0001))  # avoid division by zero
        a_loss_window.append(agent.running_a_loss/(agent.training_cnt+0.0001))  # avoid division by zero
        print('\rEpisode {:>4}\tAverage Score:{:>6.3f}\tMemory Size:{:>5}\tCLoss:{:>12.8f}\tALoss:{:>10.6f}'.format(
            i_episode, np.mean(scores_window), len(memory), np.mean(c_loss_window), np.mean(a_loss_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {:>4}\tAverage Score:{:>6.3f}\tMemory Size:{:>5}\tCLoss:{:>12.8f}\tALoss:{:>10.6f}'.format(
                i_episode, np.mean(scores_window), len(memory), np.mean(c_loss_window), np.mean(a_loss_window)))
        if np.mean(scores_window) >= 31:
            break

    torch.save(agent.actor_local.state_dict(), actor_pth)
    torch.save(agent.critic_local.state_dict(), critic_pth)
    return scores
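# Training stops once the 100-episode average score reaches 31, presumably a small
# margin above the +30 benchmark commonly used to call the Reacher task solved; the
# actor and critic weights are then written to the checkpoint paths above.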
# test the agent
def test_ddpg(agent, memory, mode='test',
              actor_pth='./checkpoint/ddpg_actor_checkpoint.pth',
              critic_pth='./checkpoint/ddpg_critic_checkpoint.pth'):
    # load models
    agent.actor_local.load_state_dict(torch.load(actor_pth))
    agent.critic_local.load_state_dict(torch.load(critic_pth))
    agent.actor_local.eval()
    agent.critic_local.eval()
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    time.sleep(10)                                      # sleep for visualization
    while True:
        action = agent.act(state, mode)
        env_info = env.step(action)[brain_name]         # send the action to the environment
        next_state = env_info.vector_observations[0]    # get the next state
        reward = env_info.rewards[0]                    # get the reward
        done = env_info.local_done[0]                   # see if episode has finished
        agent.step(state, action, reward, next_state, done)
        time.sleep(.1)
        state = next_state
        if done:
            break
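# Note: test_ddpg still calls agent.step(...) on every transition, so the replay
# buffer keeps filling during evaluation; depending on how Agent.step is written,
# this may also trigger learning updates while visualizing.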
# train DDPG model
if (model == DDPG and train_or_test == TRAIN):
    agent_ddpg = Agent(state_size=33, action_size=4, seed=199)
    ddpg_ep_rewards_list = train_ddpg(agent=agent_ddpg, memory=agent_ddpg.memory,
                                      n_episodes=5000)
    ddpg_ep_rewards_list = np.array(ddpg_ep_rewards_list)
    np.save('./data/ddpg_ep_rewards_list.npy', ddpg_ep_rewards_list)
    plot_scores(ddpg_ep_rewards_list)
# test DDPG model and visualize
if (model == DDPG and train_or_test == TEST):
    # define actor/critic model
    agent_ddpg = Agent(state_size=33, action_size=4, seed=199)
    # test model
    test_ddpg(agent=agent_ddpg, memory=agent_ddpg.memory)
################################## D4PG #######################################
# train the agent
def train_d4pg(agent, memory, n_episodes=10, mode='train',
               actor_pth='./checkpoint/d4pg_actor_checkpoint.pth',
               critic_pth='./checkpoint/d4pg_critic_checkpoint.pth'):
    '''Run the D4PG training loop and print episodic performance measures (average score, average loss).

    Params
    ======
        agent (class object)
        memory (class attribute): agent's attribute for memory size tracking
        mode (string): 'train' or 'test'; in test mode the agent follows its greedy policy only
        actor_pth / critic_pth (string path): file names for the checkpoints
    '''
    scores = []
    scores_window = deque(maxlen=100)                      # last 100 scores
    c_loss_window = deque(maxlen=100)
    a_loss_window = deque(maxlen=100)
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment and activate train_mode
        state = env_info.vector_observations                # get the current state
        score = 0
        agent.running_c_loss = 0
        agent.running_a_loss = 0
        agent.training_cnt = 0
        # agent.reset()                                    # reset OUNoise
        while True:
            action = agent.act(state, mode)
            env_info = env.step(action)[brain_name]        # send the action to the environment
            next_state = env_info.vector_observations      # get the next state
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        c_loss_window.append(agent.running_c_loss/(agent.training_cnt+0.0001))  # avoid division by zero
        a_loss_window.append(agent.running_a_loss/(agent.training_cnt+0.0001))  # avoid division by zero
        print('\rEpisode {:>4}\tAverage Score:{:>6.3f}\tMemory Size:{:>5}\tCLoss:{:>12.8f}\tALoss:{:>10.6f}'.format(
            i_episode, np.mean(scores_window), len(memory), np.mean(c_loss_window), np.mean(a_loss_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {:>4}\tAverage Score:{:>6.3f}\tMemory Size:{:>5}\tCLoss:{:>12.8f}\tALoss:{:>10.6f}'.format(
                i_episode, np.mean(scores_window), len(memory), np.mean(c_loss_window), np.mean(a_loss_window)))
        if np.mean(scores_window) >= 31:
            break

    torch.save(agent.actor_local.state_dict(), actor_pth)
    torch.save(agent.critic_local.state_dict(), critic_pth)
    return scores
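# Unlike train_ddpg, this loop feeds the full vector_observations array to the D4PG
# agent while still reading rewards[0] / local_done[0], which assumes the single-agent
# Reacher build (one row of observations per step).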
# test the agent
def test_d4pg(agent, memory, n_episodes, mode='test',
              actor_pth='./checkpoint/d4pg_actor_checkpoint.pth',
              critic_pth='./checkpoint/d4pg_critic_checkpoint.pth'):
    # load models
    agent.actor_local.load_state_dict(torch.load(actor_pth))
    agent.critic_local.load_state_dict(torch.load(critic_pth))
    agent.actor_local.eval()
    agent.critic_local.eval()
    for i in range(10):
        env_info = env.reset(train_mode=False)[brain_name]
        time.sleep(.1)
    for i in range(n_episodes):
        env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]             # get the current state
        time.sleep(5)                                       # sleep for visualization
        while True:
            action = agent.act(state, mode)
            env_info = env.step(action)[brain_name]         # send the action to the environment
            next_state = env_info.vector_observations[0]    # get the next state
            reward = env_info.rewards[0]                    # get the reward
            done = env_info.local_done[0]                   # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            # time.sleep(.1)
            state = next_state
            if done:
                break
# train d4pg
if (model == D4PG and train_or_test == TRAIN):
    agent_d4pg = AgentD4PG(state_size=33, action_size=4, seed=168)
    # from IPython.core.debugger import set_trace
    # set_trace()
    d4pg_ep_rewards_list = train_d4pg(agent=agent_d4pg, memory=agent_d4pg.memory,
                                      n_episodes=10000,
                                      mode="train",
                                      actor_pth='./checkpoint/d4pg_actor_checkpoint.pth',
                                      critic_pth='./checkpoint/d4pg_critic_checkpoint.pth')
    d4pg_ep_rewards_list = np.array(d4pg_ep_rewards_list)
    np.save('./data/d4pg_ep_rewards_list.npy', d4pg_ep_rewards_list)
    plot_scores(d4pg_ep_rewards_list)
# test D4PG model and visualize
if (model == D4PG and train_or_test == TEST):
    # define actor/critic model
    agent_d4pg = AgentD4PG(state_size=33, action_size=4, seed=68)
    # test model
    test_d4pg(agent=agent_d4pg, memory=agent_d4pg.memory, n_episodes=1)
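# Example: to train a D4PG agent from scratch, set model = D4PG and
# train_or_test = TRAIN at the top of the file and rerun this script.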