This is a gradient descent algorithm for classification, implemented from scratch using the NumPy library.
- NumPy
- Matplotlib
- Pandas
import numpy as np
import pandas as pd

train_data_frame = pd.read_csv('train_dataset.csv', header=None)
test_data_frame = pd.read_csv('test_dataset.csv', header=None)
train_dataset = np.array(train_data_frame)
test_dataset = np.array(test_data_frame)
train_lable = np.array([train_dataset[:, 0]])
train_data = np.array(train_dataset[:, 1:785]).T
test_lable = np.array([test_dataset[:, 0]]).T
test_data = np.array(test_dataset[:, 1:785])
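For an MNIST-style CSV in which each row is a label followed by 784 pixel values (which is what the slicing above assumes), the resulting array shapes are:

# m = number of training rows, k = number of test rows (they depend on your CSV files)
print(train_lable.shape)   # (1, m)   -> labels as a row vector
print(train_data.shape)    # (784, m) -> one column per training example
print(test_lable.shape)    # (k, 1)   -> labels as a column vector
print(test_data.shape)     # (k, 784) -> one row per test example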
It is good practice to shuffle the data first. Note that the shuffle should be applied to train_dataset before the labels and features are split off above, because the split produces copies that a later shuffle will not touch.
numpy.random.shuffle() shuffles the array in place; the intended ordering is sketched after the call below.
np.random.shuffle(train_dataset)
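A minimal sketch of the intended order, shuffling before the split (reusing the same lines as above):

np.random.shuffle(train_dataset)                  # shuffle the rows in place first
train_lable = np.array([train_dataset[:, 0]])     # then take the labels ...
train_data = np.array(train_dataset[:, 1:785]).T  # ... and the features from the shuffled array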
In this network, the weights are initialized to small random values while the biases are initialized to zero, each stored as a list of NumPy arrays:
def __init__(self, size):
    # one bias column vector per layer after the input, initialized to zero
    self.biases = [np.zeros([y, 1]) for y in size[1:]]
    # one weight matrix per pair of adjacent layers, scaled down to small random values
    self.weights = [np.random.randn(y, x)*0.01 for x, y in zip(size[:-1], size[1:])]
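For example, with the layer sizes used later in this README (size = [784, 2800, 10]), the lists end up with the following shapes:

import numpy as np

size = [784, 2800, 10]
biases = [np.zeros([y, 1]) for y in size[1:]]
weights = [np.random.randn(y, x) * 0.01 for x, y in zip(size[:-1], size[1:])]

print([b.shape for b in biases])    # [(2800, 1), (10, 1)]
print([w.shape for w in weights])   # [(2800, 784), (10, 2800)]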
The mini-batch size is the number of training examples pushed through the network at a time; the loss is computed over the whole mini-batch before the parameters are updated.
The learning rate alpha controls how far the weights and biases move in each update during backpropagation.
The number of epochs is the number of times the whole dataset is used to train the network.
A reasonable starting point is a mini-batch size of about 1/10th of the available data; adjust it manually between training runs to find a good value.
Alpha should be small enough that the updates do not overshoot and make the network diverge away from the minimum, but large enough that learning is not painfully slow.
The number of epochs should be chosen so that the network does not start overfitting to noise. The bookkeeping these choices imply is illustrated below.
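As a rough illustration (the training-set size here is hypothetical; the mini-batch size and epoch count match the values used further down), the number of parameter updates follows directly from these choices:

n_samples = 20000        # hypothetical training-set size, for illustration only
mini_batch_size = 2000   # roughly 1/10th of the data, as suggested above
epochs = 40

updates_per_epoch = n_samples // mini_batch_size    # 10 updates per epoch
total_updates = updates_per_epoch * epochs          # 400 updates over the whole training run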
def train_feed_forward(self, size, input, activators, mini_batch_size):
    # self.z[0] holds the input; self.z[i+1] holds the pre-activation of layer i+1
    self.z = [np.zeros([y, mini_batch_size]) for y in size]
    i = 0
    self.z[0] = input
    for bias, weight in zip(self.biases, self.weights):
        input = np.dot(weight, input) + bias                  # weighted sum plus bias
        self.z[i+1] = input
        input = getattr(activator, activators[i])(z=input)    # apply this layer's activation by name
        i = i+1
    return input
Applying activation functions changes the network from a linear map to a non-linear one, so it can fit the outputs more accurately; without them the network would be no different from linear regression.
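To see why, note that stacking layers without activations collapses into a single linear map; a quick standalone NumPy check:

import numpy as np

W1 = np.random.randn(5, 3)
W2 = np.random.randn(2, 5)
x = np.random.randn(3, 1)

two_layers = np.dot(W2, np.dot(W1, x))      # two "layers" with no activation in between
one_layer = np.dot(np.dot(W2, W1), x)       # a single equivalent linear layer
print(np.allclose(two_layers, one_layer))   # True: stacking adds no expressive power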
def sigmoid(z, derivative=False):
    if derivative==True:
        return (activator.sigmoid(z=z, derivative=False) * (1 - activator.sigmoid(z=z, derivative=False)))
    return (1.0 / (1.0 + np.exp(-z)))
def softmax(z, derivative=False):
    if derivative==True:
        # elementwise (diagonal-of-the-Jacobian) form of the softmax derivative
        return (activator.softmax(z=z, derivative=False) * (1 - activator.softmax(z=z, derivative=False)))
    # sum over the class axis so that each column (one example) sums to 1
    return (np.exp(z) / np.sum(np.exp(z), axis=0, keepdims=True))
def tanh(z, derivative=False):
    if derivative==True:
        # tanh'(z) = 1 - tanh(z)**2
        return (1 - activator.tanh(z=z, derivative=False) ** 2)
    return (np.tanh(z))
def relu(z, derivative=False):
    if derivative==True:
        # ReLU'(z) is 1 where z > 0 and 0 elsewhere (vectorized instead of looping over rows)
        return (z > 0).astype(float)
    return (np.maximum(z, 0))
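As a quick sanity check (a standalone sketch that assumes the functions above are reachable as activator.sigmoid and so on, as elsewhere in this README), the analytical derivatives can be compared against finite differences:

z = np.array([[0.5, -1.2], [2.0, 0.1]])
eps = 1e-6
numeric = (activator.sigmoid(z=z + eps) - activator.sigmoid(z=z - eps)) / (2 * eps)
analytic = activator.sigmoid(z=z, derivative=True)
print(np.allclose(numeric, analytic))   # expected: True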
def loss(self, Y, Y_hat, derivative=False):
    if derivative==True:
        return (Y_hat-Y)
    # half squared error, so that the derivative above is simply (Y_hat - Y)
    return (((Y_hat - Y) ** 2) / 2)
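On toy values (a standalone check, assuming np is imported as above), the loss and its derivative behave as expected:

Y = np.array([[1.0], [0.0]])
Y_hat = np.array([[0.8], [0.3]])
print(((Y_hat - Y) ** 2) / 2)   # [[0.02 ], [0.045]]
print(Y_hat - Y)                # [[-0.2 ], [ 0.3 ]]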
In an ANN, the output depends on every neuron it passes through.
For the output layer we have the label, so its expected value is known directly.
But for all the other layers there is no such target available, which makes finding their optimum values harder.
Backpropagation resolves this by pushing the output error backwards through the network with the chain rule, as summarized below.
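In the notation of the code that follows (z for pre-activations, a for activations, sigma for the activation function, * for the elementwise product), the quantities being accumulated are:

delta_L  = dLoss/dY_hat * sigma'(z_L)                 error at the output layer
delta_l  = (W_{l+1})^T . delta_{l+1} * sigma'(z_l)    error propagated back one layer
dC/dW_l  = delta_l . (a_{l-1})^T                      gradient for the weights of layer l
dC/db_l  = sum of delta_l over the mini-batch         gradient for the biases of layer l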
delta_nabla = self.find_nabla(size=size, activators=activators, mini_batch=mini_batch, mini_batch_size=mini_batch_size, y=y, alpha=alpha)
y_hat = self.train_feed_forward(size=size, input=mini_batch, activators=activators, mini_batch_size=mini_batch_size)
delta_nabla_b = [np.zeros([y, 1]) for y in size[1:]]
delta_nabla_w = [np.zeros([y, x]) for x, y in zip(size[:-1], size[1:])]
delta = self.loss(Y=y, Y_hat=y_hat, derivative=True) * activator.sigmoid(z=self.z[-1], derivative=True)   # error at the output layer
delta_nabla_b[-1] += np.sum(delta, axis=1, keepdims=True)   # sum over the mini-batch, keeping the (n, 1) shape
delta_nabla_w[-1] += np.dot(delta, self.z[-2].T)
for layer_no in range(-1, -(len(size) - 1), -1):   # walk backwards through the hidden layers
    delta = np.dot(self.weights[layer_no].T, delta) * activator.sigmoid(z=self.z[layer_no-1], derivative=True)
    delta_nabla_b[layer_no-1] += np.sum(delta, axis=1, keepdims=True)
    delta_nabla_w[layer_no-1] += np.dot(delta, self.z[layer_no-2].T)
delta_nabla = [delta_nabla_b, delta_nabla_w]
self.biases = [b-((alpha/mini_batch_size)*n_b) for b, n_b in zip(self.biases, delta_nabla[0])]
self.weights = [w-((alpha/mini_batch_size)*n_w) for w, n_w in zip(self.weights, delta_nabla[1])]
In size_layers, define the number of neurons in every layer of the network; in activations, give the activation function used by each layer after the input.
neuron_layer = {"size_layers": [784, 2800, 10], "activations": ["tanh", "sigmoid"] }
my_network = network(neuron_layer["size_layers"])
my_network.grad_descn(size=neuron_layer["size_layers"], expected_value=train_lable, training_data=train_data, activators=neuron_layer["activations"], alpha=0.01, mini_batch_size=2000, epochs=40)
result = my_network.test_feed_forward(size=neuron_layer["size_layers"], input=test_data.T, activators=neuron_layer["activations"])
no_trues = 0
for i in range(len(test_data)):
    # find the index of the largest output activation for test example i
    max_ans = result[0, i]
    max_ind = 0
    for j in range(10):
        if(result[j, i]>max_ans):
            max_ind = j
            max_ans = result[j, i]
    # count the prediction as correct when it matches the true label
    if(test_lable[i]==max_ind):
        no_trues+=1
print("correctly classified:", no_trues, "out of", len(test_data))
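Equivalently, given the shapes produced above (result is (10, number of test examples) and test_lable is a column vector), the accuracy can be computed more concisely with argmax:

predictions = np.argmax(result, axis=0)               # predicted class for each test example
accuracy = np.mean(predictions == test_lable[:, 0])   # fraction of correct predictions
print("accuracy:", accuracy)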