Model.py
import torch
import torch.nn as nn
import torch.optim as optim
import os

class DeepQNetwork(nn.Module):
    def __init__(self, inputs, lr, gamma):
        super().__init__()
        # Fully connected network mapping the state vector to the 3 action values
        self.fcn = nn.Sequential(nn.Linear(inputs, 1024),
                                 # nn.ReLU(),
                                 # nn.Linear(256, 120),
                                 nn.ReLU(),
                                 nn.Linear(1024, 3))
        self.lr = lr            # learning rate
        self.gamma = gamma      # discount factor for future rewards
        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()

    def forward(self, x):
        return self.fcn(x)

    def save(self, file_name='model.pth'):
        # Persist the model weights to ./model/<file_name>
        directory = './model'
        os.makedirs(directory, exist_ok=True)
        full_path = os.path.join(directory, file_name)
        torch.save(self.state_dict(), full_path)
    def DeepQTrainer(self, state, action, reward, future_state, done):
        # self is the model being trained
        """
        This function is responsible for both the short-term (single step) and
        long-term (batch) training of the model. A minimal usage sketch is
        provided at the bottom of this file.

        :param state: the current state of the agent -> an array of eleven indicators
        :type state: list / list(list)
        :param action: the action taken in that state -> a one-hot array of 3 indicators
        :type action: list / list(list)
        :param reward: the reward given to the agent for the action it took
        :type reward: int / list(int)
        :param future_state: the state observed after taking the action -> an array of eleven indicators
        :type future_state: list / list(list)
        :param done: whether the episode ended after this step
        :type done: bool / tuple(bool)
        :return: None
        :rtype: None
        """
        state = torch.tensor(state, dtype=torch.float)
        action = torch.tensor(action, dtype=torch.float)
        reward = torch.tensor(reward, dtype=torch.float)
        future_state = torch.tensor(future_state, dtype=torch.float)
        if len(state.shape) == 1:
            # Single sample: add a batch dimension so every input is 2-D
            state = torch.unsqueeze(state, dim=0)
            action = torch.unsqueeze(action, dim=0)
            reward = torch.unsqueeze(reward, dim=0)
            future_state = torch.unsqueeze(future_state, dim=0)
            done = (done, )
        # Implementation of Bellman's equation
        # 1 - Predicted Q values for the current state
        pred = self(state)
        # 2 - reward + (gamma * max(future predicted Q values)) -> if not done
        target = pred.clone()
        for i, item in enumerate(done):
            # The bare reward is used when the episode is over (the agent loses)
            future_Q = reward[i]
            if not done[i]:
                # Detach the next-state prediction so the target is treated as a constant
                future_Q = reward[i] + (self.gamma * torch.max(self(future_state[i]).detach()))
            # Update only the Q value of the action that was actually taken
            target[i][torch.argmax(action[i]).item()] = future_Q
        self.optimizer.zero_grad()
        loss = self.criterion(target, pred)
        loss.backward()
        self.optimizer.step()
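

# --- Illustrative usage sketch (not part of the original file) ---
# A minimal, hedged example of one short-memory training step, assuming an
# 11-element state vector and a 3-element one-hot action as described in the
# DeepQTrainer docstring. The lr and gamma values below are arbitrary
# placeholders, not values taken from the repository.
if __name__ == "__main__":
    model = DeepQNetwork(inputs=11, lr=0.001, gamma=0.9)

    state = [0.0] * 11            # current state (11 indicators)
    action = [1, 0, 0]            # one-hot encoding of the action taken
    reward = 10                   # reward received for that action
    future_state = [0.0] * 11     # state observed after the action
    done = False                  # episode has not ended

    # Single-sample ("short memory") training step.
    # With gamma = 0.9, if the next-state Q values were e.g. [1.0, 2.0, 0.5],
    # the Bellman target for the taken action would be 10 + 0.9 * 2.0 = 11.8.
    model.DeepQTrainer(state, action, reward, future_state, done)

    # Persist the weights to ./model/model.pth
    model.save()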