rnn.py
import numpy as np
from numpy.random import randn


class RNN:
    # A many-to-one vanilla Recurrent Neural Network.

    def __init__(self, input_size, output_size, hidden_size=64):
        # Initialize the RNN's weights and biases.
        # Weights are drawn from a standard normal distribution and scaled
        # by 1/1000 to keep the initial variance small.
        self.Whh = randn(hidden_size, hidden_size) / 1000
        self.Wxh = randn(hidden_size, input_size) / 1000
        self.Why = randn(output_size, hidden_size) / 1000

        # Biases
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((output_size, 1))
    def forward(self, inputs):
        '''
        Perform a forward pass of the RNN using the given inputs.
        Returns the final output and hidden state.
        - inputs is an array of one-hot vectors with shape (input_size, 1).
        '''
        h = np.zeros((self.Whh.shape[0], 1))

        self.last_inputs = inputs
        self.last_hs = { 0: h }

        # Perform each step of the RNN.
        for i, x in enumerate(inputs):
            h = np.tanh(self.Wxh @ x + self.Whh @ h + self.bh)
            self.last_hs[i + 1] = h

        # Compute the output.
        y = self.Why @ h + self.by

        return y, h
    def backprop(self, d_y, learn_rate=2e-2):
        '''
        Perform a backward pass of the RNN.
        - d_y (dL/dy) has shape (output_size, 1).
        - learn_rate is a float.
        '''
        n = len(self.last_inputs)

        # Calculate dL/dWhy and dL/dby.
        d_Why = d_y @ self.last_hs[n].T
        d_by = d_y

        # Initialize dL/dWhh, dL/dWxh, and dL/dbh to zero.
        d_Whh = np.zeros(self.Whh.shape)
        d_Wxh = np.zeros(self.Wxh.shape)
        d_bh = np.zeros(self.bh.shape)

        # Calculate dL/dh for the last h.
        # dL/dh = dL/dy * dy/dh
        d_h = self.Why.T @ d_y

        # Backpropagate through time.
        for t in reversed(range(n)):
            # An intermediate value: dL/dh * (1 - h^2)
            temp = (1 - self.last_hs[t + 1] ** 2) * d_h

            # dL/dbh = dL/dh * (1 - h^2)
            d_bh += temp

            # dL/dWhh = dL/dh * (1 - h^2) * h_{t-1}
            d_Whh += temp @ self.last_hs[t].T

            # dL/dWxh = dL/dh * (1 - h^2) * x
            d_Wxh += temp @ self.last_inputs[t].T

            # Next dL/dh: propagate through Whh (note the transpose).
            # dL/dh_{t-1} = Whh^T @ (dL/dh_t * (1 - h_t^2))
            d_h = self.Whh.T @ temp

        # Clip to prevent exploding gradients.
        for d in [d_Wxh, d_Whh, d_Why, d_bh, d_by]:
            np.clip(d, -1, 1, out=d)

        # Update weights and biases using gradient descent.
        self.Whh -= learn_rate * d_Whh
        self.Wxh -= learn_rate * d_Wxh
        self.Why -= learn_rate * d_Why
        self.bh -= learn_rate * d_bh
        self.by -= learn_rate * d_by
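

# --- Usage sketch (not part of the original rnn.py) ---
# A minimal example of one training step, shown only to illustrate the
# expected input/output shapes. The toy sizes (5-word vocabulary, 2 output
# classes, 3-step sequence) and the softmax + cross-entropy loss, whose
# gradient is probs - one_hot(target), are assumptions for this sketch,
# not something defined by the class above.
if __name__ == '__main__':
    np.random.seed(0)

    vocab_size, num_classes = 5, 2

    # Build a toy sequence of one-hot column vectors, each of shape (vocab_size, 1).
    inputs = []
    for idx in [0, 3, 1]:
        x = np.zeros((vocab_size, 1))
        x[idx] = 1
        inputs.append(x)

    rnn = RNN(vocab_size, num_classes)

    # Forward pass: y has shape (num_classes, 1), h has shape (hidden_size, 1).
    y, h = rnn.forward(inputs)

    # Softmax over the outputs and cross-entropy loss for a target class of 1.
    probs = np.exp(y) / np.sum(np.exp(y))
    target = 1
    loss = -np.log(probs[target, 0])

    # For softmax + cross-entropy, dL/dy = probs - one_hot(target).
    d_y = probs.copy()
    d_y[target] -= 1

    # Backward pass updates the weights and biases in place.
    rnn.backprop(d_y)
    print('loss:', loss)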