Commit
Initial
red4711 committed Dec 19, 2016
1 parent f971ec1 commit 6239076
Showing 5 changed files with 259 additions and 0 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -0,0 +1,16 @@
# Program overview
This repository contains two Python scripts: one to train the neural network model and one to test it against a given data set.
The model is a recurrent neural network (LSTM) with 3 layers of 256 nodes each.
The network takes a 512x512 array representing the spectrogram of a spoken-word audio file; the data is already preprocessed.
The network predicts the first letter of the spoken word. The best network configuration reached an accuracy of 40%.
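
For reference, a minimal sketch of the batch shapes the two scripts assume (it requires the spoken_words data set described below; the shapes follow from the defaults in deep-net.py):

```python
import numpy as np
import speech_data
from speech_data import Target

# One batch: 64 spectrograms of 512x512 pixels, one-hot labels over 32 classes.
batch = speech_data.spectro_batch_generator(64, 512, target=Target.first_letter)
X, Y = next(batch)
X = np.array(X).reshape((-1, 512, 512))
print(X.shape, np.array(Y).shape)  # (64, 512, 512) (64, 32)
```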


Program dependencies: TensorFlow, NumPy, and scikit-image

To run predictions on a trained model: run `python3 test-deep-net.py`
The output is the accuracy of each batch and the average accuracy over N batches (20 by default).

To run the training algorithm:
You will need to acquire the data set at https://www.dropbox.com/s/2ff0x8z60bjjz7b/spoken_words.tar?dl=0
Extract all the data into a folder called spoken_words in the same directory as deep-net.py
Run `python3 deep-net.py`
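Training writes cost and accuracy summaries to `./Tensorboard/`; if TensorBoard is installed, you can inspect them with `tensorboard --logdir ./Tensorboard`.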
100 changes: 100 additions & 0 deletions deep-net.py
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
# Written against the TensorFlow 0.x API (rnn_cell, scalar_summary, SummaryWriter, ...).

import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
import speech_data
from speech_data import Target

n_classes = 32     # one class per possible first letter
hm_epochs = 5
batch_size = 64
chunk_size = 512   # pixels per spectrogram row, fed one row per time step
n_chunks = 512     # spectrogram rows = RNN time steps
rnn_size = 256     # LSTM units per layer
n_layers = 3
width = 512        # spectrogram width (= height) in pixels
n_steps = 256      # training steps per epoch
learning_rate = 0.001

batch = speech_data.spectro_batch_generator(batch_size, width, target=Target.first_letter)

with tf.name_scope('X'):
    x = tf.placeholder('float', [None, n_chunks, chunk_size])
with tf.name_scope('Observed_Values'):
    y = tf.placeholder('float')

def recurrent_neural_network(x):
    with tf.name_scope('Weights'):
        weights = tf.Variable(tf.random_normal([rnn_size, n_classes]))
    with tf.name_scope('Bias'):
        bias = tf.Variable(tf.random_normal([n_classes]))

    layer = {'weights': weights, 'biases': bias}

    with tf.name_scope('Input'):
        # (batch, n_chunks, chunk_size) -> a list of n_chunks tensors of
        # shape (batch, chunk_size), one per RNN time step.
        x = tf.transpose(x, [1, 0, 2])
        x = tf.reshape(x, [-1, chunk_size])
        x = tf.split(0, n_chunks, x)

    with tf.name_scope('Layers'):
        lstm_cell = rnn_cell.BasicLSTMCell(rnn_size)
        lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * n_layers)

        outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)

    with tf.name_scope('Prediction'):
        # Classify from the LSTM output at the last time step.
        output = tf.matmul(outputs[-1], layer['weights']) + layer['biases']

    return output


def train_neural_network(x):
    prediction = recurrent_neural_network(x)
    with tf.name_scope('Cost'):
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(prediction, y))
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

    with tf.name_scope('Accuracy'):
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

    with tf.name_scope('Summaries'):
        tf.scalar_summary("cost", cost)
        tf.scalar_summary("accuracy", accuracy)

    summary_op = tf.merge_all_summaries()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        writer = tf.train.SummaryWriter('./Tensorboard/', sess.graph)

        with tf.name_scope('Testing_Data'):
            test_X, test_Y = next(batch)
            test_X = np.array(test_X)
            test_X = test_X.reshape((-1, n_chunks, chunk_size))

        for epoch in range(hm_epochs):
            for step in range(n_steps):
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                with tf.name_scope('Training_Data'):
                    epoch_x, epoch_y = next(batch)
                    epoch_x = np.array(epoch_x)
                    epoch_x = epoch_x.reshape((batch_size, n_chunks, chunk_size))
                summary, _ = sess.run([summary_op, optimizer],
                                      feed_dict={x: epoch_x, y: epoch_y},
                                      options=run_options, run_metadata=run_metadata)

                # Record run metadata and summaries every 50 global steps.
                if (epoch * n_steps + step) % 50 == 0:
                    writer.add_run_metadata(run_metadata, 'step%d' % (epoch * n_steps + step))
                    writer.add_summary(summary, epoch * n_steps + step)

            print('Accuracy', accuracy.eval({x: test_X, y: test_Y}))

        saver.save(sess, "model.ckpt")  # checkpoint restored by test-deep-net.py

if __name__ == '__main__':
    train_neural_network(x)
Binary file added model.ckpt
Binary file not shown.
76 changes: 76 additions & 0 deletions speech_data.py
@@ -0,0 +1,76 @@
import gzip
import os
import re
import skimage.io # scikit-image
import numpy
import numpy as np
import wave
# import extensions as xx
from random import shuffle
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin

CHUNK = 4096
test_fraction=0.1 # 10% of data for test / verification

class Source:  # labels
    NUMBER_WAVES = 'spoken_numbers_wav.tar'
    DIGIT_WAVES = 'spoken_numbers_pcm.tar'
    DIGIT_SPECTROS = 'spoken_numbers_spectros_64x64.tar'  # 64x64 baby data set, works astonishingly well
    NUMBER_IMAGES = 'spoken_numbers.tar'  # width=256 height=256
    TEST_INDEX = 'test_index.txt'
    TRAIN_INDEX = 'train_index.txt'

from enum import Enum

class Target(Enum):  # labels
    digits = 1
    speaker = 2
    words_per_minute = 3
    word_phonemes = 4
    word = 5  # characters=5
    sentence = 6
    sentiment = 7
    first_letter = 8

def dense_to_one_hot(labels_dense, num_classes=10):
    """Convert a class label (int) to a one-hot vector."""
    return numpy.eye(num_classes)[labels_dense]
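# For example: dense_to_one_hot(2, num_classes=4) -> array([0., 0., 1., 0.])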

def spectro_batch_generator(batch_size=10, width=64, target=Target.digits):
    path = "spoken_words"
    height = width
    batch = []
    labels = []
    if target == Target.digits: num_classes = 10
    if target == Target.first_letter: num_classes = 32
    files = os.listdir(path)
    # shuffle(files) # todo: split test_fraction batch here!
    # files = files[0:int(len(files)*(1-test_fraction))]
    print("Got %d source data files from %s" % (len(files), path))
    while True:
        shuffle(files)
        for image_name in files:
            if "_" not in image_name: continue  # skip files without a label prefix
            image = skimage.io.imread(path + "/" + image_name).astype(numpy.float32)
            data = image / 255.  # scale pixels to 0-1 for better convergence
            data = data.reshape([width * height])  # flatten; tf.matmul needs 2-D matrices
            batch.append(list(data))
            classe = (ord(image_name[0]) - 48) % 32  # '0'->0, 'A'->17, 'z'->10
            labels.append(dense_to_one_hot(classe, num_classes))
            if len(batch) >= batch_size:
                yield batch, labels
                batch = []  # reset for the next batch
                labels = []

def spectro_batch(batch_size=10):
    return spectro_batch_generator(batch_size)
67 changes: 67 additions & 0 deletions test-deep-net.py
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# Written against the TensorFlow 0.x API, like deep-net.py.

import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
import speech_data
from speech_data import Target

n_classes = 32
batch_size = 64
chunk_size = 512
n_chunks = 512
rnn_size = 256
n_layers = 3
width = 512

# The generator configuration must match the one used for training.
batch = speech_data.spectro_batch_generator(batch_size, width, target=Target.first_letter)

x = tf.placeholder('float', [None, n_chunks, chunk_size])
y = tf.placeholder('float')

def recurrent_neural_network(x):
    # Rebuild the same graph as deep-net.py so the checkpoint variables match.
    layer = {'weights': tf.Variable(tf.random_normal([rnn_size, n_classes])),
             'biases': tf.Variable(tf.random_normal([n_classes]))}

    x = tf.transpose(x, [1, 0, 2])
    x = tf.reshape(x, [-1, chunk_size])
    x = tf.split(0, n_chunks, x)

    lstm_cell = rnn_cell.BasicLSTMCell(rnn_size)
    lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * n_layers)

    outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)

    output = tf.matmul(outputs[-1], layer['weights']) + layer['biases']

    return output

def test_neural_network(x):
    prediction = recurrent_neural_network(x)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        saver.restore(sess, "model.ckpt")  # weights saved by deep-net.py

        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

        avg = 0

        for _ in range(20):  # average over N = 20 batches
            test_X, test_Y = next(batch)
            test_X = np.array(test_X)
            temp = accuracy.eval({x: test_X.reshape((-1, n_chunks, chunk_size)), y: test_Y})
            print("Accuracy:", temp)
            avg += temp

        avg /= 20
        print("Average accuracy:", avg)

if __name__ == '__main__':
    test_neural_network(x)
