Commit 6239076 by red4711, committed Dec 19, 2016 (1 parent: f971ec1)
Showing 5 changed files with 259 additions and 0 deletions.
README.md
@@ -0,0 +1,16 @@
# Program overview
This repository contains two different Python scripts: one to train the neural network model and one to test it against a given data set.
The neural network is a recurrent neural network (LSTM) with 3 layers of 256 nodes each.
The network takes in a 256^2-sized array representing the spectrogram of a spoken-word audio file. The data is already preprocessed.
The network outputs the first letter of the spoken word. The resulting accuracy of the best network configuration is 40%.
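
For reference, the "first letter" label is derived from the first character of each audio file's name; a minimal sketch of the mapping that speech_data.py applies (the helper name `first_letter_class` is illustrative, not from the code):

```python
# Mirrors the label encoding in speech_data.py's spectro_batch_generator:
# the first character of the file name becomes one of 32 classes.
def first_letter_class(file_name):
    return (ord(file_name[0]) - 48) % 32  # '0' -> 0, 'A' -> 17, 'z' -> 10
```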

Program dependencies: TensorFlow and scikit-image

To run predictions on a trained model: run test-deep-net.py
Output is the accuracy averaged over a number of batches (20 in test-deep-net.py).

To run the training algorithm:
You will need to acquire the data set at https://www.dropbox.com/s/2ff0x8z60bjjz7b/spoken_words.tar?dl=0
Extract all the data into a folder called spoken_words relative to the directory containing deep-net.py,
then run deep-net.py.
deep-net.py
@@ -0,0 +1,100 @@
#! /usr/bin/python35

import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
import speech_data
from speech_data import Source, Target

n_classes = 32
hm_epochs = 5
batch_size = 64
chunk_size = 512
n_chunks = 512
rnn_size = 256
n_layers = 3
width = 512
n_steps = 256
learning_rate = 0.001

batch = speech_data.spectro_batch_generator(batch_size, width, target=Target.first_letter)

with tf.name_scope('X'):
    x = tf.placeholder('float', [None, n_chunks, chunk_size])
with tf.name_scope('Observed_Values'):
    y = tf.placeholder('float')

def recurrent_neural_network(x):
    with tf.name_scope('Weights'):
        weights = tf.Variable(tf.random_normal([rnn_size, n_classes]))
    with tf.name_scope('Bias'):
        bias = tf.Variable(tf.random_normal([n_classes]))

    layer = {'weights': weights, 'biases': bias}

    with tf.name_scope('Input'):
        x = tf.transpose(x, [1, 0, 2])
        x = tf.reshape(x, [-1, chunk_size])
        x = tf.split(0, n_chunks, x)
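        # Shape walk-through: x enters as [batch, n_chunks, chunk_size];
        # transpose -> [n_chunks, batch, chunk_size]; reshape -> [n_chunks * batch, chunk_size];
        # split along dim 0 -> a length-n_chunks list of [batch, chunk_size] tensors,
        # the sequence format the TF 0.x rnn.rnn() API expects.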

    with tf.name_scope('Layers'):
        lstm_cell = rnn_cell.BasicLSTMCell(rnn_size)
        lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * n_layers)

    outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)

    with tf.name_scope('Prediction'):
        output = tf.matmul(outputs[-1], layer['weights']) + layer['biases']

    return output

def train_neural_network(x):
    prediction = recurrent_neural_network(x)
    with tf.name_scope('Cost'):
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(prediction, y))
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

    with tf.name_scope('Accuracy'):
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

    with tf.name_scope('Summaries'):
        tf.scalar_summary("cost", cost)
        tf.scalar_summary("accuracy", accuracy)

    summary_op = tf.merge_all_summaries()
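
    # Assumed addition: a Saver so the learned weights can be written out;
    # test-deep-net.py expects to restore them from "model.ckpt".
    saver = tf.train.Saver()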

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        writer = tf.train.SummaryWriter('./Tensorboard/', sess.graph)

        with tf.name_scope('Testing_Data'):
            test_X, test_Y = next(batch)
            test_X = np.array(test_X)
            test_X = test_X.reshape((-1, n_chunks, chunk_size))

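        # Note: this evaluation batch comes from the same endless generator as the
        # training batches (speech_data never splits off test_fraction), so the
        # per-epoch accuracy below is not measured on a truly held-out set.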
        for epoch in range(hm_epochs):
            for step in range(n_steps):
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                with tf.name_scope('Training_Data'):
                    epoch_x, epoch_y = next(batch)
                    epoch_x = np.array(epoch_x)
                    epoch_x = epoch_x.reshape((batch_size, n_chunks, chunk_size))
                summary, _ = sess.run([summary_op, optimizer], feed_dict={x: epoch_x, y: epoch_y},
                                      options=run_options, run_metadata=run_metadata)

                if (epoch * n_steps + step) % 50 == 0:  # log every 50th global step
                    writer.add_run_metadata(run_metadata, 'step%d' % (epoch * n_steps + step))
                    writer.add_summary(summary, epoch * n_steps + step)

            print('Accuracy', accuracy.eval({x: test_X, y: test_Y}))
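
        # Write the trained weights to "model.ckpt" for test-deep-net.py to restore.
        saver.save(sess, "model.ckpt")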

if __name__ == '__main__':
    train_neural_network(x)
Binary file not shown.
speech_data.py
@@ -0,0 +1,76 @@
import os
from enum import Enum
from random import shuffle

import numpy
import skimage.io  # scikit-image
import tensorflow as tf

CHUNK = 4096
test_fraction = 0.1  # 10% of data for test / verification

class Source:  # labels
    NUMBER_WAVES = 'spoken_numbers_wav.tar'
    DIGIT_WAVES = 'spoken_numbers_pcm.tar'
    DIGIT_SPECTROS = 'spoken_numbers_spectros_64x64.tar'  # 64x64 baby data set, works astonishingly well
    NUMBER_IMAGES = 'spoken_numbers.tar'  # width=256 height=256
    TEST_INDEX = 'test_index.txt'
    TRAIN_INDEX = 'train_index.txt'

class Target(Enum):  # labels
    digits = 1
    speaker = 2
    words_per_minute = 3
    word_phonemes = 4
    word = 5  # characters
    sentence = 6
    sentiment = 7
    first_letter = 8

def dense_to_one_hot_tf(batch, batch_size, num_labels):
    """Graph-mode one-hot encoding using TF 0.x ops (unused by these scripts)."""
    sparse_labels = tf.reshape(batch, [batch_size, 1])
    indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
    concatenated = tf.concat(1, [indices, sparse_labels])
    concat = tf.concat(0, [[batch_size], [num_labels]])
    output_shape = tf.reshape(concat, [2])
    sparse_to_dense = tf.sparse_to_dense(concatenated, output_shape, 1.0, 0.0)
    return tf.reshape(sparse_to_dense, [batch_size, num_labels])

def dense_to_one_hot(labels_dense, num_classes=10):
    """Convert class labels from scalars to one-hot vectors."""
    return numpy.eye(num_classes)[labels_dense]
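
# Example: dense_to_one_hot(2, num_classes=4) -> array([0., 0., 1., 0.])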

def spectro_batch_generator(batch_size=10, width=64, target=Target.digits):
    path = "spoken_words"
    height = width
    batch = []
    labels = []
    if target == Target.digits: num_classes = 10
    if target == Target.first_letter: num_classes = 32
    files = os.listdir(path)
    # shuffle(files)  # todo: split test_fraction batch here!
    # files = files[0:int(len(files) * (1 - test_fraction))]
    print("Got %d source data files from %s" % (len(files), path))
    while True:
        # print("shuffling source data files")
        shuffle(files)
        for image_name in files:
            if "_" not in image_name: continue  # skip files that do not contain '_' (unexpected naming)
            image = skimage.io.imread(path + "/" + image_name).astype(numpy.float32)
            data = image / 255.  # scale to [0, 1] for better convergence
            data = data.reshape([width * height])  # flatten: tf.matmul expects flattened matrices
            batch.append(list(data))
            classe = (ord(image_name[0]) - 48) % 32  # '0' -> 0, 'A' -> 17, 'z' -> 10
            labels.append(dense_to_one_hot(classe, num_classes))
            if len(batch) >= batch_size:
                yield batch, labels
                batch = []  # reset for next batch
                labels = []
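
# Example usage (this is how deep-net.py consumes the generator):
#   gen = spectro_batch_generator(64, 512, target=Target.first_letter)
#   X, Y = next(gen)  # X: 64 flattened 512x512 spectrograms, Y: 64 one-hot 32-class labels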

def spectro_batch(batch_size=10):
    return spectro_batch_generator(batch_size)
test-deep-net.py
@@ -0,0 +1,67 @@
#! /usr/bin/python35

import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
import speech_data
from speech_data import Source, Target

n_classes = 32
hm_epochs = 5
batch_size = 64
chunk_size = 512
n_chunks = 512
rnn_size = 256
n_layers = 3
width = 512
n_steps = 256

batch = speech_data.spectro_batch_generator(batch_size, width, target=Target.first_letter)

x = tf.placeholder('float', [None, n_chunks, chunk_size])
y = tf.placeholder('float')

def recurrent_neural_network(x):
    layer = {'weights': tf.Variable(tf.random_normal([rnn_size, n_classes])),
             'biases': tf.Variable(tf.random_normal([n_classes]))}

    x = tf.transpose(x, [1, 0, 2])
    x = tf.reshape(x, [-1, chunk_size])
    x = tf.split(0, n_chunks, x)

    lstm_cell = rnn_cell.BasicLSTMCell(rnn_size)
    lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * n_layers)

    outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)

    output = tf.matmul(outputs[-1], layer['weights']) + layer['biases']

    return output
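
# NOTE: the graph built above must match the training graph in deep-net.py exactly;
# otherwise saver.restore() below cannot map the checkpoint variables by name.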

def test_neural_network(x):
    prediction = recurrent_neural_network(x)

    # Evaluation only needs the forward pass, so no cost/optimizer ops are built here.
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        saver.restore(sess, "model.ckpt")

        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

        avg = 0

        for _ in range(20):
            test_X, test_Y = next(batch)
            test_X = np.array(test_X)
            temp = accuracy.eval({x: test_X.reshape((-1, n_chunks, chunk_size)), y: test_Y})
            print("Accuracy:", temp)
            avg += temp

        avg /= 20
        print("Average accuracy:", avg)

if __name__ == '__main__':
    test_neural_network(x)