-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
196 lines (150 loc) · 7.59 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import numpy as np
from rnn_utils import softmax
def smooth(loss, curr_loss):
    """
    Blend the running loss with the newest loss using an exponentially
    weighted moving average.

    Parameters
    -----------
    loss : previously smoothed loss value
    curr_loss : loss computed at the current step

    Returns
    ----------
    smoothed_loss : float
        Running average over roughly the last 1000 losses.
    """
    # decay of 0.999 gives an effective averaging window of ~1000 steps
    decay = 0.999
    return decay * loss + (1 - decay) * curr_loss
def print_sample(sample_idx, idx2char):
    """
    Decode a sequence of character indices into text and print it framed
    by separator lines.
    """
    chars = [idx2char[i] for i in sample_idx]
    sentence = ''.join(chars)
    print(f'----\n {sentence} \n----')
def get_initial_loss(vocab_size, seq_length):
    """
    Loss of an untrained (uniform) model: the cross-entropy of predicting
    each character with probability 1/vocab_size, summed over seq_length
    time steps. Useful as a sane starting value for loss smoothing.
    """
    uniform_prob = 1.0 / vocab_size
    return -np.log(uniform_prob) * seq_length
def initialize_parameters(n_a, n_x, n_y):
    """
    Initialize RNN weights with small random values and biases with zeros.

    Returns:
    parameters -- python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        b --  Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    """
    # NOTE: the order of the randn() calls is significant for seeded
    # reproducibility -- keep input -> hidden -> output.
    scale = 0.01  # small values keep tanh in its linear regime early on
    Wax = np.random.randn(n_a, n_x) * scale  # input to hidden
    Waa = np.random.randn(n_a, n_a) * scale  # hidden to hidden
    Wya = np.random.randn(n_y, n_a) * scale  # hidden to output
    b = np.zeros((n_a, 1))   # hidden bias
    by = np.zeros((n_y, 1))  # output bias

    return {
        "Wax": Wax,
        "Waa": Waa,
        "Wya": Wya,
        "b": b,
        "by": by,
    }
def rnn_step_forward(parameters, a_prev, x):
    """
    Run a single forward time step of the RNN cell.

    Parameters
    ----------
    parameters : python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        b --  Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    a_prev : ndarray of the activations from the previous time step
    x : ndarray of the input

    Returns
    ----------
    a_next : ndarray
        activation of the current time step
    p_t : ndarray
        predicted character probabilities (y hat) for this step
    """
    b = parameters['b']
    by = parameters['by']
    # Fuse the two matmuls into one: [Waa | Wax] @ [a_prev ; x] == Waa@a_prev + Wax@x
    W_stacked = np.concatenate((parameters['Waa'], parameters['Wax']), axis=1)
    inputs_stacked = np.concatenate((a_prev, x), axis=0)
    a_next = np.tanh(np.dot(W_stacked, inputs_stacked) + b)  # new hidden state
    p_t = softmax(np.dot(parameters['Wya'], a_next) + by)    # probabilities for next chars
    return a_next, p_t
def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):
    """
    Run a single backward time step of the RNN cell, accumulating into
    `gradients` in place.

    Parameters
    ----------
    dy : Gradient of the output, numpy array of shape (n_y, 1)
    gradients : python dictionary containing (all accumulated in place):
                        dWax -- Gradient of the Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        dWaa -- Gradient of the Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        dWya -- Gradient of the Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        db --  Gradient of the Bias, numpy array of shape (n_a, 1)
                        dby -- Gradient of the bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
                        da_next -- Gradient flowing back into the hidden state, numpy array of shape (n_a, 1)
    parameters : python dictionary of the network weights (see rnn_step_forward)
    x : ndarray of the input at this time step
    a : ndarray of the activations from the current time step
    a_prev : ndarray of the activations from the previous time step

    Returns
    ----------
    gradients : the same dictionary, updated
    """
    Wya = parameters['Wya']
    Waa = parameters['Waa']

    # Output layer gradients.
    gradients['dWya'] += np.dot(dy, a.T)
    gradients['dby'] += dy

    # Backprop into the hidden state: from the output AND from the next time step.
    da = np.dot(Wya.T, dy) + gradients['da_next']
    dtanh = (1 - a * a) * da  # d/dz tanh(z) = 1 - tanh(z)^2, and a = tanh(z)

    gradients['db'] += dtanh
    gradients['dWax'] += np.dot(dtanh, x.T)
    gradients['dWaa'] += np.dot(dtanh, a_prev.T)
    # Hand the hidden-state gradient to the previous time step.
    gradients['da_next'] = np.dot(Waa.T, dtanh)
    return gradients
def update_parameters(parameters, gradients, lr):
    """
    Apply one step of vanilla gradient descent to every parameter, in place.

    Each weight W is updated as W <- W - lr * dW, where dW is looked up in
    `gradients` under the key 'd' + name. Returns the same dictionary.
    """
    for name in ('Wax', 'Waa', 'Wya', 'b', 'by'):
        parameters[name] -= lr * gradients['d' + name]
    return parameters
def rnn_forward(X, Y, a0, parameters, vocab_size=71):
    """
    Run the RNN forward over a whole input sequence.

    X is a sequence of input character indices, Y the target indices,
    a0 the initial hidden state. Returns the accumulated cross-entropy
    loss and a cache (y_hat, a, x) for the backward pass.
    """
    # Per-time-step caches, keyed by t; a[-1] holds the initial state.
    x = {}
    y_hat = {}
    a = {-1: np.copy(a0)}

    loss = 0
    for t, char_idx in enumerate(X):
        # One-hot encode the t'th input character.
        one_hot = np.zeros((vocab_size, 1))
        one_hot[char_idx] = 1
        x[t] = one_hot

        # Advance the hidden state and get this step's prediction.
        a[t], y_hat[t] = rnn_step_forward(parameters, a[t - 1], x[t])

        # Cross-entropy contribution of the target character at step t.
        loss -= np.log(y_hat[t][Y[t], 0])

    return loss, (y_hat, a, x)
def rnn_backward(X, Y, parameters, cache):
    """
    Backpropagation through time over the whole sequence.

    Unpacks the forward cache (y_hat, a, x), zero-initializes one gradient
    per parameter, then walks the time steps in reverse, accumulating via
    rnn_step_backward. Returns the gradients and the hidden states `a`.
    """
    (y_hat, a, x) = cache

    # One zero gradient per parameter, plus the hidden-state carry da_next.
    gradients = {
        'dWax': np.zeros_like(parameters['Wax']),
        'dWaa': np.zeros_like(parameters['Waa']),
        'dWya': np.zeros_like(parameters['Wya']),
        'db': np.zeros_like(parameters['b']),
        'dby': np.zeros_like(parameters['by']),
        'da_next': np.zeros_like(a[0]),
    }

    # Walk the sequence backwards: t = len(X)-1, ..., 0.
    for t in range(len(X) - 1, -1, -1):
        # Gradient of softmax + cross-entropy w.r.t. the logits: y_hat - y.
        dy = np.copy(y_hat[t])
        dy[Y[t]] -= 1
        gradients = rnn_step_backward(dy, gradients, parameters,
                                      x[t], a[t], a[t - 1])

    return gradients, a