utils.py
import torch


class Linear(object):
@staticmethod
def forward(x, w, b):
"""
        Computes the forward pass for a linear (fully-connected) layer.
The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
examples, where each example x[i] has shape (d_1, ..., d_k). We will
reshape each input into a vector of dimension D = d_1 * ... * d_k, and
then transform it to an output vector of dimension M.
Inputs:
- x: A tensor containing input data, of shape (N, d_1, ..., d_k)
- w: A tensor of weights, of shape (D, M)
- b: A tensor of biases, of shape (M,)
Returns a tuple of:
- out: output, of shape (N, M)
- cache: (x, w, b)
"""
out = None
        N = x.shape[0]
        x_reshaped = x.view(N, -1)          # flatten each example to a D-dimensional vector
        out = torch.mm(x_reshaped, w) + b   # (N, D) @ (D, M) + (M,) broadcast -> (N, M)
cache = (x, w, b)
        return out, cache

    @staticmethod
def backward(dout, cache):
"""
        Computes the backward pass for a linear layer.
Inputs:
- dout: Upstream derivative, of shape (N, M)
- cache: Tuple of:
        - x: Input data, of shape (N, d_1, ..., d_k)
- w: Weights, of shape (D, M)
- b: Biases, of shape (M,)
Returns a tuple of:
        - dx: Gradient with respect to x, of shape
          (N, d_1, ..., d_k)
- dw: Gradient with respect to w, of shape (D, M)
- db: Gradient with respect to b, of shape (M,)
"""
x, w, b = cache
dx, dw, db = None, None, None
        # dL/dx = dout @ w^T, reshaped back to the original input shape
        dx = torch.matmul(dout, w.t()).view(*x.shape)
        # dL/dw = x_flat^T @ dout, where x_flat is x flattened to shape (N, D)
        dw = torch.matmul(x.view(x.shape[0], -1).t(), dout)
        # dL/db = sum of the upstream gradient over the batch dimension
        db = torch.sum(dout, dim=0)
return dx, dw, db
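

# Illustrative usage sketch (not part of the original module): small, arbitrary
# shapes chosen to show how Linear flattens each example before the matrix multiply.
def _linear_example():
    x = torch.randn(2, 3, 4)                  # N=2 examples, D = 3*4 = 12
    w = torch.randn(12, 5)                    # D=12, M=5
    b = torch.zeros(5)
    out, cache = Linear.forward(x, w, b)      # out has shape (2, 5)
    dout = torch.ones_like(out)               # stand-in upstream gradient
    dx, dw, db = Linear.backward(dout, cache)
    return dx.shape, dw.shape, db.shape       # (2, 3, 4), (12, 5), (5,)

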
class ReLU(object):
@staticmethod
def forward(x):
"""
Computes the forward pass for a layer of rectified
linear units (ReLUs).
Input:
- x: Input; a tensor of any shape
Returns a tuple of:
- out: Output, a tensor of the same shape as x
- cache: x
"""
        # elementwise max(0, x); the zero tensor matches x's dtype and device
        out = torch.maximum(torch.tensor(0, dtype=x.dtype, device=x.device), x)
cache = x
        return out, cache

    @staticmethod
def backward(dout, cache):
"""
Computes the backward pass for a layer of rectified
linear units (ReLUs).
Input:
- dout: Upstream derivatives, of any shape
- cache: Input x, of same shape as dout
Returns:
- dx: Gradient with respect to x
"""
        dx, x = None, cache
        # gradient passes through only where the input was positive
        dx = dout * (x > 0).float()
return dx
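

# Illustrative usage sketch (not part of the original module): ReLU zeroes
# negative inputs on the forward pass and blocks their gradient on the backward pass.
def _relu_example():
    x = torch.tensor([[-1.0, 2.0], [3.0, -4.0]])
    out, cache = ReLU.forward(x)               # [[0., 2.], [3., 0.]]
    dx = ReLU.backward(torch.ones_like(out), cache)
    return out, dx                             # dx is [[0., 1.], [1., 0.]]

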
def adam(w, dw, config=None):
"""
Uses the Adam update rule, which incorporates moving averages of both the
gradient and its square and a bias correction term.
config format:
- learning_rate: Scalar learning rate.
- beta1: Decay rate for moving average of first moment of gradient.
- beta2: Decay rate for moving average of second moment of gradient.
- epsilon: Small scalar used for smoothing to avoid dividing by zero.
- m: Moving average of gradient.
- v: Moving average of squared gradient.
- t: Iteration number.
"""
if config is None: config = {}
config.setdefault('learning_rate', 1e-3)
config.setdefault('beta1', 0.9)
config.setdefault('beta2', 0.999)
config.setdefault('epsilon', 1e-8)
config.setdefault('m', torch.zeros_like(w))
config.setdefault('v', torch.zeros_like(w))
config.setdefault('t', 0)
next_w = None
    config['t'] += 1
    # update the biased first moment estimate (moving average of the gradient)
    config['m'] = config['beta1'] * config['m'] + (1 - config['beta1']) * dw
    mt = config['m'] / (1 - config['beta1'] ** config['t'])    # bias-corrected first moment
    # update the biased second moment estimate (moving average of the squared gradient)
    config['v'] = config['beta2'] * config['v'] + (1 - config['beta2']) * (dw * dw)
    vc = config['v'] / (1 - config['beta2'] ** config['t'])    # bias-corrected second moment
    # take a step scaled by the adaptive per-parameter learning rate
    next_w = w - (config['learning_rate'] * mt) / (torch.sqrt(vc) + config['epsilon'])
return next_w, config
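

# Illustrative usage sketch (not part of the original module): the returned config
# carries the Adam state (m, v, t), so it must be passed back in on every call.
def _adam_example(num_steps=3):
    w = torch.zeros(4)
    config = None
    for _ in range(num_steps):
        dw = 2.0 * w - 1.0                     # gradient of a toy quadratic objective
        w, config = adam(w, dw, config)
    return w, config['t']                      # config['t'] equals num_steps

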
def softmax_loss(x, y):
"""
Computes the loss and gradient for softmax classification.
Inputs:
- x: Input data, of shape (N, C) where x[i, j] is the score for
the jth class for the ith input.
- y: Vector of labels, of shape (N,) where y[i] is the label
for x[i] and 0 <= y[i] < C
Returns a tuple of:
- loss: Scalar giving the loss
- dx: Gradient of the loss with respect to x
"""
    # shift scores so the per-row maximum is zero, for numerical stability
    shifted_logits = x - x.max(dim=1, keepdim=True).values
    Z = shifted_logits.exp().sum(dim=1, keepdim=True)    # per-row partition function
    log_probs = shifted_logits - Z.log()                 # log-softmax
    probs = log_probs.exp()
    N = x.shape[0]
    # loss: average negative log-likelihood of the correct classes
    loss = (-1.0 / N) * log_probs[torch.arange(N), y].sum()
    # gradient: softmax probabilities, minus 1 at the correct class, averaged over N
    dx = probs.clone()
    dx[torch.arange(N), y] -= 1
    dx /= N
return loss, dx
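

# Illustrative usage sketch (not part of the original module): with identical
# scores the loss is log(C), and each row of the gradient sums to zero.
def _softmax_loss_example():
    N, C = 4, 3
    x = torch.zeros(N, C)                      # identical scores for every class
    y = torch.tensor([0, 1, 2, 0])
    loss, dx = softmax_loss(x, y)
    return loss, dx.sum(dim=1)                 # loss ≈ log(3) ≈ 1.0986; row sums ≈ 0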