Skip to content

Commit

Permalink
dump-dataset, more options
Browse files: browse the repository at this point in the history
  • Loading branch information
albertz committed Feb 22, 2018
1 parent 0050840 commit f7edfed
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 4 deletions.
1 change: 1 addition & 0 deletions GeneratingDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1510,6 +1510,7 @@ def __init__(self, path, prefix, bpe, audio, partition_epoch=None, fixed_random_
self.partition_epoch = partition_epoch
self.transs = self._collect_trans()
self._reference_seq_order = sorted(self.transs.keys())
self.init_seq_order()

def _collect_trans(self):
transs = {}
Expand Down
31 changes: 27 additions & 4 deletions tools/dump-dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import os
import sys
import time

my_dir = os.path.dirname(os.path.abspath(__file__))
returnn_dir = os.path.dirname(my_dir)
Expand All @@ -14,7 +15,7 @@
import argparse
import numpy
from better_exchook import pretty_print
from Util import Stats
from Util import Stats, hms
import Util


Expand Down Expand Up @@ -66,25 +67,44 @@ def dump_dataset(dataset, options):
print("Dump files: %r*%r" % (options.dump_prefix, options.dump_postfix), file=log.v3)
elif options.type == "stdout":
print("Dump to stdout", file=log.v3)
elif options.type == "print_shape":
print("Dump shape to stdout", file=log.v3)
elif options.type == "plot":
print("Plot.", file=log.v3)
elif options.type == "null":
print("No dump.")
else:
raise Exception("unknown dump option type %r" % options.type)

start_time = time.time()
stats = Stats() if (options.stats or options.dump_stats) else None
seq_len_stats = {key: Stats() for key in dataset.get_data_keys()}
seq_idx = options.startseq
if options.endseq < 0:
options.endseq = float("inf")
while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= options.endseq:
dataset.load_seqs(seq_idx, seq_idx + 1)
complete_frac = dataset.get_complete_frac(seq_idx)
start_elapsed = time.time() - start_time
try:
num_seqs_s = str(dataset.num_seqs)
except NotImplementedError:
try:
num_seqs_s = "~%i" % dataset.estimated_num_seqs
except TypeError: # a number is required, not NoneType
num_seqs_s = "?"
progress = "%i/%s (%.02f%%)" % (seq_idx, num_seqs_s, complete_frac * 100)
if complete_frac > 0:
total_time_estimated = start_elapsed / complete_frac
remaining_estimated = total_time_estimated - start_elapsed
progress += " (%s)" % hms(remaining_estimated)
data = dataset.get_data(seq_idx, options.key)
if options.type == "numpy":
numpy.savetxt("%s%i.data%s" % (options.dump_prefix, seq_idx, options.dump_postfix), data)
elif options.type == "stdout":
print("seq %i data:" % seq_idx, pretty_print(data))
print("seq %s data:" % progress, pretty_print(data))
elif options.type == "print_shape":
print("seq %s data shape:" % progress, data.shape)
elif options.type == "plot":
plot(data)
for target in dataset.get_target_list():
Expand All @@ -93,17 +113,20 @@ def dump_dataset(dataset, options):
numpy.savetxt("%s%i.targets.%s%s" % (options.dump_prefix, seq_idx, target, options.dump_postfix), targets, fmt='%i')
elif options.type == "stdout":
print("seq %i target %r:" % (seq_idx, target), pretty_print(targets))
elif options.type == "print_shape":
print("seq %i target %r shape:" % (seq_idx, target), targets.shape)
seq_len = dataset.get_seq_length(seq_idx)
for key in dataset.get_data_keys():
seq_len_stats[key].collect([seq_len[key]])
if stats:
stats.collect(data)
if options.type == "null":
Util.progress_bar_with_time(dataset.get_complete_frac(seq_idx))
Util.progress_bar_with_time(complete_frac)

seq_idx += 1

print("Done. More seqs which we did not dumped: %s" % dataset.is_less_than_num_seqs(seq_idx), file=log.v1)
print("Done. Total time %s. More seqs which we did not dumped: %s" % (
hms(time.time() - start_time), dataset.is_less_than_num_seqs(seq_idx)), file=log.v1)
for key in dataset.get_data_keys():
seq_len_stats[key].dump(stream_prefix="Seq-length %r " % key, stream=log.v2)
if stats:
Expand Down

0 comments on commit f7edfed

Please sign in to comment.