-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathenvironment.py
112 lines (89 loc) · 3.02 KB
/
environment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import numpy as np
class Environment:
    '''
    Environment class which specifies how the agent moves through a given maze.

    The maze is a grid of num_y_states rows by num_x_states columns. States
    are numbered row-major: state = i * num_x_states + j, where i is the row
    (y coordinate) and j is the column (x coordinate).
    Actions: 0 = up, 1 = down, 2 = left, 3 = right.
    '''
    def __init__(self, p):
        '''
        Initialise the class instance and build the environment model.

        params:
            p: dict with keys "num_x_states", "num_y_states", "num_actions",
               "goal_state", "reward_at_goal", "blocked_states", "start_state"
        '''
        self.num_x_states = p["num_x_states"]
        self.num_y_states = p["num_y_states"]
        self.num_actions = p["num_actions"]
        self.goal_state = p["goal_state"]
        self.reward_at_goal = p["reward_at_goal"]
        self.blocked_states = p["blocked_states"]
        self.start_state = p["start_state"]
        self.num_states = self.num_x_states * self.num_y_states
        self._generate_env_model()
    def _generate_env_model(self):
        '''
        Generate the transition model T (num_states x num_actions x
        num_states) and the reward model R (num_states x num_actions).

        Each T[s, a, :] is normalised into a probability distribution over
        successor states; rows that received no transitions are left as zeros.
        '''
        self.T = np.zeros((self.num_states, self.num_actions, self.num_states))
        # NOTE(review): dtype=int truncates non-integer rewards; kept from the
        # original — confirm reward_at_goal is always integral.
        self.R = np.zeros((self.num_states, self.num_actions), dtype=int)
        for s in range(self.num_states):
            for a in range(self.num_actions):
                s1, r = self._get_new_state(s, a)
                self.T[s, a, s1] += 1
                self.R[s, a] = r
        # normalise: fix — the original used bitwise `~` on a numpy bool
        # (`~np.all(... == 0)`), which only works by accident; test the sum
        # explicitly instead
        for s in range(self.num_states):
            for a in range(self.num_actions):
                total = np.sum(self.T[s, a, :])
                if total > 0:
                    self.T[s, a, :] /= total
    def _get_new_state(self, s, a):
        '''
        Returns next state and reward for taking action a in state s.

        A move that would leave the grid or enter a blocked state leaves the
        agent in place with zero reward. Entering the goal state yields
        reward_at_goal.

        params:
            s: state index
            a: action (0 = up, 1 = down, 2 = left, 3 = right)
        '''
        i, j = self._convert_state_to_coords(s)
        r = 0
        # boundary checks: independent conditions (not if/elif) so degenerate
        # single-row or single-column grids are guarded at BOTH edges — the
        # original elif chain stepped out of bounds when
        # num_y_states == 1 (action down) or num_x_states == 1 (action right)
        if i == 0 and a == 0:  # top edge, move up
            return s, r
        if i == self.num_y_states - 1 and a == 1:  # bottom edge, move down
            return s, r
        if j == 0 and a == 2:  # left edge, move left
            return s, r
        if j == self.num_x_states - 1 and a == 3:  # right edge, move right
            return s, r
        if a == 0:  # move up
            ni, nj = i - 1, j
        elif a == 1:  # move down
            ni, nj = i + 1, j
        elif a == 2:  # move left
            ni, nj = i, j - 1
        else:  # move right
            ni, nj = i, j + 1
        s1 = self._convert_coords_to_state(ni, nj)
        # moving into a blocked state leaves the agent where it was
        if s1 in self.blocked_states:
            return s, r
        # reward is granted on entering the goal state
        if s1 == self.goal_state:
            r = self.reward_at_goal
        return s1, r
    def _convert_state_to_coords(self, s):
        '''
        Convert a state index to (row, column) grid coordinates.

        params:
            s: state index
        '''
        return s // self.num_x_states, s % self.num_x_states
    def _convert_coords_to_state(self, i, j):
        '''
        Convert (row, column) grid coordinates to a state index.

        Uses row-major arithmetic directly; the original built and reshaped a
        full arange(num_states) array on every call just to read one element.

        params:
            i: y coordinate (row)
            j: x coordinate (column)
        '''
        return i * self.num_x_states + j