diff --git a/prepro.py b/prepro.py
old mode 100644
new mode 100755
index aad11c3c..7a4724a8
--- a/prepro.py
+++ b/prepro.py
@@ -13,13 +13,13 @@
 The hdf5 file contains several fields:
 /images is (N,3,256,256) uint8 array of raw image data in RGB format
 /labels is (M,max_length) uint32 array of encoded labels, zero padded
-/label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the 
+/label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
 first and last indices (in range 1..M) of labels for each image
 /label_length stores the length of the sequence for each of the M sequences
 
 The json file has a dict that contains:
 - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
-- an 'images' field that is a list holding auxiliary information for each image, 
+- an 'images' field that is a list holding auxiliary information for each image,
 such as in particular the 'split' it was assigned to.
 """
 
@@ -28,213 +28,231 @@
 import argparse
 from random import shuffle, seed
 import string
+
 # non-standard dependencies:
 import h5py
 import numpy as np
 from scipy.misc import imread, imresize
 
+
 def prepro_captions(imgs):
-  
-  # preprocess all the captions
-  print 'example processed tokens:'
-  for i,img in enumerate(imgs):
-    img['processed_tokens'] = []
-    for j,s in enumerate(img['captions']):
-      txt = str(s).lower().translate(None, string.punctuation).strip().split()
-      img['processed_tokens'].append(txt)
-      if i < 10 and j == 0: print txt
+
+    # preprocess all the captions
+    print('example processed tokens:')
+    for i, img in enumerate(imgs):
+        img['processed_tokens'] = []
+
+        for j, s in enumerate(img['captions']):
+            txt = s.lower()
+            txt = ''.join([x for x in txt if x not in string.punctuation])
+            txt = txt.strip().split()
+            if txt:
+                img['processed_tokens'].append(txt)
+            if i < 10 and j == 0:
+                print(txt)
+
 
 def build_vocab(imgs, params):
-  count_thr = params['word_count_threshold']
-
-  # count up the number of words
-  counts = {}
-  for img in imgs:
-    for txt in img['processed_tokens']:
-      for w in txt:
-        counts[w] = counts.get(w, 0) + 1
-  cw = sorted([(count,w) for w,count in counts.iteritems()], reverse=True)
-  print 'top words and their counts:'
-  print '\n'.join(map(str,cw[:20]))
-
-  # print some stats
-  total_words = sum(counts.itervalues())
-  print 'total words:', total_words
-  bad_words = [w for w,n in counts.iteritems() if n <= count_thr]
-  vocab = [w for w,n in counts.iteritems() if n > count_thr]
-  bad_count = sum(counts[w] for w in bad_words)
-  print 'number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/len(counts))
-  print 'number of words in vocab would be %d' % (len(vocab), )
-  print 'number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/total_words)
-
-  # lets look at the distribution of lengths as well
-  sent_lengths = {}
-  for img in imgs:
-    for txt in img['processed_tokens']:
-      nw = len(txt)
-      sent_lengths[nw] = sent_lengths.get(nw, 0) + 1
-  max_len = max(sent_lengths.keys())
-  print 'max length sentence in raw data: ', max_len
-  print 'sentence length distribution (count, number of words):'
-  sum_len = sum(sent_lengths.values())
-  for i in xrange(max_len+1):
-    print '%2d: %10d %f%%' % (i, sent_lengths.get(i,0), sent_lengths.get(i,0)*100.0/sum_len)
-
-  # lets now produce the final annotations
-  if bad_count > 0:
-    # additional special UNK token we will use below to map infrequent words to
-    print 'inserting the special UNK token'
-    vocab.append('UNK')
-
-  for img in imgs:
-    img['final_captions'] = []
-    for txt in img['processed_tokens']:
-      caption = [w if counts.get(w,0) > count_thr else 'UNK' for w in txt]
-      img['final_captions'].append(caption)
-
-  return vocab
+    count_thr = params['word_count_threshold']
+
+    # count up the number of words
+    counts = {}
+    for img in imgs:
+        for txt in img['processed_tokens']:
+            for w in txt:
+                counts[w] = counts.get(w, 0) + 1
+
+    cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
+    print('top words and their counts:')
+    print('\n'.join(map(str, cw[:20])))
+
+    # print some stats
+    total_words = sum(counts.values())
+    print('total words: {}'.format(total_words))
+    bad_words = [w for w, n in counts.items() if n <= count_thr]
+    vocab = [w for w, n in counts.items() if n > count_thr]
+    bad_count = sum(counts[w] for w in bad_words)
+    print('number of bad words: {}/{} = {:.2f}%'.format(len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts)))
+    print('number of words in vocab would be {}'.format(len(vocab)))
+    print('number of UNKs: {}/{} = {:.2f}%'.format(bad_count, total_words, bad_count * 100.0 / total_words))
+
+    # lets look at the distribution of lengths as well
+    sent_lengths = {}
+    for img in imgs:
+        for txt in img['processed_tokens']:
+            nw = len(txt)
+            sent_lengths[nw] = sent_lengths.get(nw, 0) + 1
+    max_len = max(sent_lengths.keys())
+    print('max length sentence in raw data: {}'.format(max_len))
+    print('sentence length distribution (count, number of words):')
+    sum_len = sum(sent_lengths.values())
+    for i in range(max_len + 1):
+        print('{:2d}: {:10d} {}%'.format(i, sent_lengths.get(i, 0), sent_lengths.get(i, 0) * 100.0 / sum_len))
+
+    # lets now produce the final annotations
+    if bad_count:
+        # additional special UNK token we will use below to map infrequent words to
+        print('inserting the special UNK token')
+        vocab.append('UNK')
+
+    for img in imgs:
+        img['final_captions'] = []
+        for txt in img['processed_tokens']:
+            caption = [w if counts.get(w, 0) > count_thr else 'UNK' for w in txt]
+            img['final_captions'].append(caption)
+
+    return vocab
+
 
 def assign_splits(imgs, params):
-  num_val = params['num_val']
-  num_test = params['num_test']
+    num_val = params['num_val']
+    num_test = params['num_test']
 
-  for i,img in enumerate(imgs):
-    if i < num_val:
-      img['split'] = 'val'
-    elif i < num_val + num_test:
-      img['split'] = 'test'
-    else:
-      img['split'] = 'train'
+    for i, img in enumerate(imgs):
+        if i < num_val:
+            img['split'] = 'val'
+        elif i < num_val + num_test:
+            img['split'] = 'test'
+        else:
+            img['split'] = 'train'
+
+    print('assigned {} to val, {} to test.'.format(num_val, num_test))
 
-  print 'assigned %d to val, %d to test.' % (num_val, num_test)
 
 def encode_captions(imgs, params, wtoi):
-  """
-  encode all captions into one large array, which will be 1-indexed.
-  also produces label_start_ix and label_end_ix which store 1-indexed
-  and inclusive (Lua-style) pointers to the first and last caption for
-  each image in the dataset.
- """ - - max_length = params['max_length'] - N = len(imgs) - M = sum(len(img['final_captions']) for img in imgs) # total number of captions - - label_arrays = [] - label_start_ix = np.zeros(N, dtype='uint32') # note: these will be one-indexed - label_end_ix = np.zeros(N, dtype='uint32') - label_length = np.zeros(M, dtype='uint32') - caption_counter = 0 - counter = 1 - for i,img in enumerate(imgs): - n = len(img['final_captions']) - assert n > 0, 'error: some image has no captions' - - Li = np.zeros((n, max_length), dtype='uint32') - for j,s in enumerate(img['final_captions']): - label_length[caption_counter] = min(max_length, len(s)) # record the length of this sequence - caption_counter += 1 - for k,w in enumerate(s): - if k < max_length: - Li[j,k] = wtoi[w] - - # note: word indices are 1-indexed, and captions are padded with zeros - label_arrays.append(Li) - label_start_ix[i] = counter - label_end_ix[i] = counter + n - 1 - - counter += n - - L = np.concatenate(label_arrays, axis=0) # put all the labels together - assert L.shape[0] == M, 'lengths don\'t match? that\'s weird' - assert np.all(label_length > 0), 'error: some caption had no words?' - - print 'encoded captions to array of size ', `L.shape` - return L, label_start_ix, label_end_ix, label_length + """ + encode all captions into one large array, which will be 1-indexed. + also produces label_start_ix and label_end_ix which store 1-indexed + and inclusive (Lua-style) pointers to the first and last caption for + each image in the dataset. + """ + + max_length = params['max_length'] + N = len(imgs) + M = sum(len(img['final_captions']) for img in imgs) # total number of captions + + label_arrays = [] + label_start_ix = np.zeros(N, dtype='uint32') # note: these will be one-indexed + label_end_ix = np.zeros(N, dtype='uint32') + label_length = np.zeros(M, dtype='uint32') + caption_counter = 0 + counter = 1 + for i, img in enumerate(imgs): + n = len(img['final_captions']) + assert n > 0, 'error: some image has no captions {}'.format(img) + + Li = np.zeros((n, max_length), dtype='uint32') + for j, s in enumerate(img['final_captions']): + label_length[caption_counter] = min(max_length, len(s)) # record the length of this sequence + caption_counter += 1 + for k, w in enumerate(s): + if k < max_length: + Li[j, k] = wtoi[w] + + # note: word indices are 1-indexed, and captions are padded with zeros + label_arrays.append(Li) + label_start_ix[i] = counter + label_end_ix[i] = counter + n - 1 + + counter += n + + L = np.concatenate(label_arrays, axis=0) # put all the labels together + assert L.shape[0] == M, 'lengths don\'t match? that\'s weird' + assert np.all(label_length > 0), 'error: some caption had no words?' 
+
+    print('encoded captions to array of size {!r}'.format(L.shape))
+    return L, label_start_ix, label_end_ix, label_length
+
 
 def main(params):
-  imgs = json.load(open(params['input_json'], 'r'))
-  seed(123) # make reproducible
-  shuffle(imgs) # shuffle the order
-
-  # tokenization and preprocessing
-  prepro_captions(imgs)
-
-  # create the vocab
-  vocab = build_vocab(imgs, params)
-  itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table
-  wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table
-
-  # assign the splits
-  assign_splits(imgs, params)
-
-  # encode captions in large arrays, ready to ship to hdf5 file
-  L, label_start_ix, label_end_ix, label_length = encode_captions(imgs, params, wtoi)
-
-  # create output h5 file
-  N = len(imgs)
-  f = h5py.File(params['output_h5'], "w")
-  f.create_dataset("labels", dtype='uint32', data=L)
-  f.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix)
-  f.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
-  f.create_dataset("label_length", dtype='uint32', data=label_length)
-  dset = f.create_dataset("images", (N,3,256,256), dtype='uint8') # space for resized images
-  for i,img in enumerate(imgs):
-    # load the image
-    I = imread(os.path.join(params['images_root'], img['file_path']))
-    try:
-      Ir = imresize(I, (256,256))
-    except:
-      print 'failed resizing image %s - see http://git.io/vBIE0' % (img['file_path'],)
-      raise
-    # handle grayscale input images
-    if len(Ir.shape) == 2:
-      Ir = Ir[:,:,np.newaxis]
-      Ir = np.concatenate((Ir,Ir,Ir), axis=2)
-    # and swap order of axes from (256,256,3) to (3,256,256)
-    Ir = Ir.transpose(2,0,1)
-    # write to h5
-    dset[i] = Ir
-    if i % 1000 == 0:
-      print 'processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N)
-  f.close()
-  print 'wrote ', params['output_h5']
-
-  # create output json file
-  out = {}
-  out['ix_to_word'] = itow # encode the (1-indexed) vocab
-  out['images'] = []
-  for i,img in enumerate(imgs):
-
-    jimg = {}
-    jimg['split'] = img['split']
-    if 'file_path' in img: jimg['file_path'] = img['file_path'] # copy it over, might need
-    if 'id' in img: jimg['id'] = img['id'] # copy over & mantain an id, if present (e.g. coco ids, useful)
-
-    out['images'].append(jimg)
-
-  json.dump(out, open(params['output_json'], 'w'))
-  print 'wrote ', params['output_json']
+    imgs = json.load(open(params['input_json'], 'r'))
+    seed(123) # make reproducible
+    shuffle(imgs) # shuffle the order
+
+    # tokenization and preprocessing
+    prepro_captions(imgs)
+
+    imgs = [i for i in imgs if i['processed_tokens']]
+
+    # create the vocab
+    vocab = build_vocab(imgs, params)
+    itow = {i + 1: w for i, w in enumerate(vocab)} # a 1-indexed vocab translation table
+    wtoi = {w: i + 1 for i, w in enumerate(vocab)} # inverse table
+
+    # assign the splits
+    assign_splits(imgs, params)
+
+    # encode captions in large arrays, ready to ship to hdf5 file
+    L, label_start_ix, label_end_ix, label_length = encode_captions(imgs, params, wtoi)
+
+    # create output h5 file
+    N = len(imgs)
+    f = h5py.File(params['output_h5'], "w")
+    f.create_dataset("labels", dtype='uint32', data=L)
+    f.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix)
+    f.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
+    f.create_dataset("label_length", dtype='uint32', data=label_length)
+    dset = f.create_dataset("images", (N, 3, 256, 256), dtype='uint8') # space for resized images
+
+    for i, img in enumerate(imgs):
+        # load the image
+        I = imread(os.path.join(params['images_root'], img['file_path']))
+        try:
+            Ir = imresize(I, (256, 256))
+        except:
+            print('failed resizing image {} - see http://git.io/vBIE0'.format(img['file_path']))
+            raise
+        # handle grayscale input images
+        if len(Ir.shape) == 2:
+            Ir = Ir[:, :, np.newaxis]
+            Ir = np.concatenate((Ir, Ir, Ir), axis=2)
+        # and swap order of axes from (256,256,3) to (3,256,256)
+        Ir = Ir.transpose(2, 0, 1)
+        # write to h5
+        dset[i] = Ir
+        if i % 1000 == 0:
+            print('processing {}/{} ({:.2f}% done)'.format(i, N, i * 100.0 / N))
+    f.close()
+    print('wrote {}'.format(params['output_h5']))
+
+    # create output json file
+    out = {}
+    out['ix_to_word'] = itow # encode the (1-indexed) vocab
+    out['images'] = []
+
+    for i, img in enumerate(imgs):
+
+        jimg = {}
+        jimg['split'] = img['split']
+        if 'file_path' in img:
+            jimg['file_path'] = img['file_path'] # copy it over, might need it
+        if 'id' in img:
+            jimg['id'] = img['id'] # copy over & maintain an id, if present (e.g. coco ids, useful)
+
+        out['images'].append(jimg)
+
+    json.dump(out, open(params['output_json'], 'w'))
+    print('wrote {}'.format(params['output_json']))
 
 
 if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-
-  # input json
-  parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
-  parser.add_argument('--num_val', required=True, type=int, help='number of images to assign to validation data (for CV etc)')
-  parser.add_argument('--output_json', default='data.json', help='output json file')
-  parser.add_argument('--output_h5', default='data.h5', help='output h5 file')
-
-  # options
-  parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.')
-  parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
-  parser.add_argument('--word_count_threshold', default=5, type=int, help='only words that occur more than this number of times will be put in vocab')
-  parser.add_argument('--num_test', default=0, type=int, help='number of test images (to withold until very very end)')
-
-  args = parser.parse_args()
-  params = vars(args) # convert to ordinary dict
-  print 'parsed input parameters:'
-  print json.dumps(params, indent = 2)
-  main(params)
+    parser = argparse.ArgumentParser()
+
+    # input json
+    parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
+    parser.add_argument('--num_val', required=True, type=int, help='number of images to assign to validation data (for CV etc)')
+    parser.add_argument('--output_json', default='data.json', help='output json file')
+    parser.add_argument('--output_h5', default='data.h5', help='output h5 file')
+
+    # options
+    parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.')
+    parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
+    parser.add_argument('--word_count_threshold', default=5, type=int, help='only words that occur more than this number of times will be put in vocab')
+    parser.add_argument('--num_test', default=0, type=int, help='number of test images (to withhold until the very end)')
+
+    args = parser.parse_args()
+    params = vars(args) # convert to ordinary dict
+    print('parsed input parameters:')
+    print(json.dumps(params, indent=2))
+    main(params)
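
For reference, below is a minimal sketch of how the files this script writes might be read back downstream. It is not part of the patch: the file names follow the argparse defaults above (data.json, data.h5), the invocation paths are hypothetical, and the field layout is taken from the module docstring.

# hypothetical invocation (paths are placeholders):
#   python prepro.py --input_json raw_captions.json --num_val 5000 --num_test 5000 \
#       --images_root /path/to/images --output_json data.json --output_h5 data.h5

import json

import h5py

info = json.load(open('data.json', 'r'))
ix_to_word = info['ix_to_word']  # 1-indexed vocab; keys are strings after the json round trip

with h5py.File('data.h5', 'r') as f:
    labels = f['labels'][...]            # (M, max_length) uint32, zero padded
    start_ix = f['label_start_ix'][...]  # 1-indexed, inclusive (Lua-style)
    end_ix = f['label_end_ix'][...]

    # captions of the first image: shift the 1-indexed start down by one,
    # keep the inclusive end as the Python slice bound
    for row in labels[start_ix[0] - 1:end_ix[0]]:
        words = [ix_to_word[str(ix)] for ix in row if ix > 0]  # 0 is padding
        print(' '.join(words))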