Commit 6239076 by red4711, committed Dec 19, 2016 (1 parent: f971ec1)
Showing 5 changed files with 259 additions and 0 deletions.
README.md
@@ -0,0 +1,16 @@
# Program overview
This repository contains two different Python scripts: one to train the neural network model and one to test it against a given data set.
The neural network is a recurrent neural network (LSTM) with 3 layers of 256 nodes each.
The network takes in a 256^2-sized array representing the spectrogram of a spoken-word audio file. The data is already preprocessed.
The network outputs the first letter of the spoken word. The resulting accuracy of the best network configuration is 40%.
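
For reference, the "first letter" label is derived from the first character of each audio file's name; a minimal sketch of the mapping that speech_data.py applies (the helper name `first_letter_class` is illustrative, not from the code):

```python
# Mirrors the label encoding in speech_data.py's spectro_batch_generator:
# the first character of the file name becomes one of 32 classes.
def first_letter_class(file_name):
    return (ord(file_name[0]) - 48) % 32  # '0' -> 0, 'A' -> 17, 'z' -> 10
```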

Program dependencies: TensorFlow and scikit-image

To run predictions on a trained model: run test-deep-net.py
Output is the accuracy averaged over a number of batches (20 in test-deep-net.py).

To run the training algorithm:
You will need to acquire the data set at https://www.dropbox.com/s/2ff0x8z60bjjz7b/spoken_words.tar?dl=0
Extract all the data into a folder called spoken_words relative to the directory containing deep-net.py,
then run deep-net.py.
deep-net.py
@@ -0,0 +1,100 @@
#! /usr/bin/python35

import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
import speech_data
from speech_data import Source, Target

n_classes = 32
hm_epochs = 5
batch_size = 64
chunk_size = 512
n_chunks = 512
rnn_size = 256
n_layers = 3
width = 512
n_steps = 256
learning_rate = 0.001

batch = speech_data.spectro_batch_generator(batch_size, width, target=Target.first_letter)

with tf.name_scope('X'):
    x = tf.placeholder('float', [None, n_chunks, chunk_size])
with tf.name_scope('Observed_Values'):
    y = tf.placeholder('float')

def recurrent_neural_network(x):
    with tf.name_scope('Weights'):
        weights = tf.Variable(tf.random_normal([rnn_size, n_classes]))
    with tf.name_scope('Bias'):
        bias = tf.Variable(tf.random_normal([n_classes]))

    layer = {'weights': weights, 'biases': bias}

    with tf.name_scope('Input'):
        x = tf.transpose(x, [1, 0, 2])
        x = tf.reshape(x, [-1, chunk_size])
        x = tf.split(0, n_chunks, x)
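        # Shape walk-through: x enters as [batch, n_chunks, chunk_size];
        # transpose -> [n_chunks, batch, chunk_size]; reshape -> [n_chunks * batch, chunk_size];
        # split along dim 0 -> a length-n_chunks list of [batch, chunk_size] tensors,
        # the sequence format the TF 0.x rnn.rnn() API expects.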

    with tf.name_scope('Layers'):
        lstm_cell = rnn_cell.BasicLSTMCell(rnn_size)
        lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * n_layers)

    outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)

    with tf.name_scope('Prediction'):
        output = tf.matmul(outputs[-1], layer['weights']) + layer['biases']

    return output

def train_neural_network(x):
    prediction = recurrent_neural_network(x)
    with tf.name_scope('Cost'):
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(prediction, y))
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

    with tf.name_scope('Accuracy'):
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

    with tf.name_scope('Summaries'):
        tf.scalar_summary("cost", cost)
        tf.scalar_summary("accuracy", accuracy)

    summary_op = tf.merge_all_summaries()
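
    # Assumed addition: a Saver so the learned weights can be written out;
    # test-deep-net.py expects to restore them from "model.ckpt".
    saver = tf.train.Saver()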

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        writer = tf.train.SummaryWriter('./Tensorboard/', sess.graph)

        with tf.name_scope('Testing_Data'):
            test_X, test_Y = next(batch)
            test_X = np.array(test_X)
            test_X = test_X.reshape((-1, n_chunks, chunk_size))

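        # Note: this evaluation batch comes from the same endless generator as the
        # training batches (speech_data never splits off test_fraction), so the
        # per-epoch accuracy below is not measured on a truly held-out set.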
        for epoch in range(hm_epochs):
            for step in range(n_steps):
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                with tf.name_scope('Training_Data'):
                    epoch_x, epoch_y = next(batch)
                    epoch_x = np.array(epoch_x)
                    epoch_x = epoch_x.reshape((batch_size, n_chunks, chunk_size))
                summary, _ = sess.run([summary_op, optimizer], feed_dict={x: epoch_x, y: epoch_y},
                                      options=run_options, run_metadata=run_metadata)

                if (epoch * n_steps + step) % 50 == 0:  # log every 50th global step
                    writer.add_run_metadata(run_metadata, 'step%d' % (epoch * n_steps + step))
                    writer.add_summary(summary, epoch * n_steps + step)

            print('Accuracy', accuracy.eval({x: test_X, y: test_Y}))
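
        # Write the trained weights to "model.ckpt" for test-deep-net.py to restore.
        saver.save(sess, "model.ckpt")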

if __name__ == '__main__':
    train_neural_network(x)
Binary file not shown.
speech_data.py
@@ -0,0 +1,76 @@
import os
from enum import Enum
from random import shuffle

import numpy
import skimage.io  # scikit-image
import tensorflow as tf

CHUNK = 4096
test_fraction = 0.1  # 10% of data for test / verification

class Source:  # labels
    NUMBER_WAVES = 'spoken_numbers_wav.tar'
    DIGIT_WAVES = 'spoken_numbers_pcm.tar'
    DIGIT_SPECTROS = 'spoken_numbers_spectros_64x64.tar'  # 64x64 baby data set, works astonishingly well
    NUMBER_IMAGES = 'spoken_numbers.tar'  # width=256 height=256
    TEST_INDEX = 'test_index.txt'
    TRAIN_INDEX = 'train_index.txt'

class Target(Enum):  # labels
    digits = 1
    speaker = 2
    words_per_minute = 3
    word_phonemes = 4
    word = 5  # characters
    sentence = 6
    sentiment = 7
    first_letter = 8

def dense_to_one_hot_tf(batch, batch_size, num_labels):
    """Graph-mode one-hot encoding using TF 0.x ops (unused by these scripts)."""
    sparse_labels = tf.reshape(batch, [batch_size, 1])
    indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
    concatenated = tf.concat(1, [indices, sparse_labels])
    concat = tf.concat(0, [[batch_size], [num_labels]])
    output_shape = tf.reshape(concat, [2])
    sparse_to_dense = tf.sparse_to_dense(concatenated, output_shape, 1.0, 0.0)
    return tf.reshape(sparse_to_dense, [batch_size, num_labels])

def dense_to_one_hot(labels_dense, num_classes=10):
    """Convert class labels from scalars to one-hot vectors."""
    return numpy.eye(num_classes)[labels_dense]
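
# Example: dense_to_one_hot(2, num_classes=4) -> array([0., 0., 1., 0.])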

def spectro_batch_generator(batch_size=10, width=64, target=Target.digits):
    path = "spoken_words"
    height = width
    batch = []
    labels = []
    if target == Target.digits: num_classes = 10
    if target == Target.first_letter: num_classes = 32
    files = os.listdir(path)
    # shuffle(files)  # todo: split test_fraction batch here!
    # files = files[0:int(len(files) * (1 - test_fraction))]
    print("Got %d source data files from %s" % (len(files), path))
    while True:
        # print("shuffling source data files")
        shuffle(files)
        for image_name in files:
            if "_" not in image_name: continue  # skip files that do not contain '_' (unexpected naming)
            image = skimage.io.imread(path + "/" + image_name).astype(numpy.float32)
            data = image / 255.  # scale to [0, 1] for better convergence
            data = data.reshape([width * height])  # flatten: tf.matmul expects flattened matrices
            batch.append(list(data))
            classe = (ord(image_name[0]) - 48) % 32  # '0' -> 0, 'A' -> 17, 'z' -> 10
            labels.append(dense_to_one_hot(classe, num_classes))
            if len(batch) >= batch_size:
                yield batch, labels
                batch = []  # reset for next batch
                labels = []
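
# Example usage (this is how deep-net.py consumes the generator):
#   gen = spectro_batch_generator(64, 512, target=Target.first_letter)
#   X, Y = next(gen)  # X: 64 flattened 512x512 spectrograms, Y: 64 one-hot 32-class labels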

def spectro_batch(batch_size=10):
    return spectro_batch_generator(batch_size)
test-deep-net.py
@@ -0,0 +1,67 @@
#! /usr/bin/python35

import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
import speech_data
from speech_data import Source, Target

n_classes = 32
hm_epochs = 5
batch_size = 64
chunk_size = 512
n_chunks = 512
rnn_size = 256
n_layers = 3
width = 512
n_steps = 256

batch = speech_data.spectro_batch_generator(batch_size, width, target=Target.first_letter)

x = tf.placeholder('float', [None, n_chunks, chunk_size])
y = tf.placeholder('float')

def recurrent_neural_network(x):
    layer = {'weights': tf.Variable(tf.random_normal([rnn_size, n_classes])),
             'biases': tf.Variable(tf.random_normal([n_classes]))}

    x = tf.transpose(x, [1, 0, 2])
    x = tf.reshape(x, [-1, chunk_size])
    x = tf.split(0, n_chunks, x)

    lstm_cell = rnn_cell.BasicLSTMCell(rnn_size)
    lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * n_layers)

    outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)

    output = tf.matmul(outputs[-1], layer['weights']) + layer['biases']

    return output
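
# NOTE: the graph built above must match the training graph in deep-net.py exactly;
# otherwise saver.restore() below cannot map the checkpoint variables by name.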

def test_neural_network(x):
    prediction = recurrent_neural_network(x)

    # Evaluation only needs the forward pass, so no cost/optimizer ops are built here.
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        saver.restore(sess, "model.ckpt")

        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

        avg = 0

        for _ in range(20):
            test_X, test_Y = next(batch)
            test_X = np.array(test_X)
            temp = accuracy.eval({x: test_X.reshape((-1, n_chunks, chunk_size)), y: test_Y})
            print("Accuracy:", temp)
            avg += temp

        avg /= 20
        print("Average accuracy:", avg)

if __name__ == '__main__':
    test_neural_network(x)