split_dataset.py
#!/usr/bin/env python3
"""split_dataset.py

Splits a dataset into train and dev sets.

Usage:
    split_dataset.py <root_dir> <output_dir>
                     [--train_portion=0.7]
                     [--max_files_to_process=0]

Options:
    --train_portion=0.7       How much of the data to use for training [default: 0.7]
    --max_files_to_process=0  How many input files to process [default: 0 => all files]
"""
import os
import random
import shutil
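# Expected on-disk layout (inferred from the code below; file names are illustrative):
#   <root_dir>/mfcc/.../utt.wav.mfcc.hdf5   MFCC features
#   <root_dir>/spec/.../utt.wav.spec.hdf5   matching spectrograms
# The split is written as symlink trees under <output_dir>/{train,dev}/{spec,mfcc}/...
# mirroring the source directory structure.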
class DatasetSplitter(object):
    def __init__(self, dirs):
        self.dirs = dirs

    def split_data(self, train_portion, max_files_to_process=0):
        # Collect (spec, mfcc) pairs by walking the MFCC tree and deriving
        # the matching spectrogram path from each MFCC path.
        all_files = []
        for root, dirs, files in os.walk(self.dirs['mfcc'], topdown=False):
            for name in files:
                if not name.endswith('.mfcc.hdf5'):
                    continue
                mfcc = os.path.join(root, name)
                spec = mfcc.replace(self.dirs['mfcc'], self.dirs['spec']).replace('wav.mfcc', 'wav.spec')
                # Sanity checks: both halves of the pair must exist.
                assert os.path.isfile(mfcc)
                assert os.path.isfile(spec)
                all_files.append((spec, mfcc))

        # Shuffle before splitting so train and dev draw from the same distribution.
        random.shuffle(all_files)
        if max_files_to_process != 0 and len(all_files) > max_files_to_process:
            all_files = all_files[:max_files_to_process]

        num_train = int(len(all_files) * train_portion)
        print(num_train, 'out of', len(all_files), 'files will be used for training')

        # Link each pair into the train or dev tree, preserving the directory
        # structure relative to the source roots.
        self._link_pairs(all_files[:num_train], 'train')
        self._link_pairs(all_files[num_train:], 'dev')

    def _link_pairs(self, pairs, split):
        for spec, mfcc in pairs:
            dest_spec = spec.replace(self.dirs['spec'], self.dirs[split + '/spec'])
            dest_mfcc = mfcc.replace(self.dirs['mfcc'], self.dirs[split + '/mfcc'])
            for dest in (dest_spec, dest_mfcc):
                if not os.path.isdir(os.path.dirname(dest)):
                    os.makedirs(os.path.dirname(dest))
            # Symlink rather than copy to save disk space; swap in shutil.copy
            # if the destination tree must be self-contained.
            os.symlink(spec, dest_spec)
            os.symlink(mfcc, dest_mfcc)
            # shutil.copy(spec, dest_spec)
            # shutil.copy(mfcc, dest_mfcc)
if __name__ == '__main__':
    from docopt import docopt
    from pprint import pprint

    args = docopt(__doc__)
    print('User arguments')
    pprint(args)

    train_portion = float(args['--train_portion'])
    max_files_to_process = int(args['--max_files_to_process'])
    root = args['<root_dir>']
    out_dir = args['<output_dir>']

    # Fixed seed so repeated runs produce the same split.
    random.seed(1001)

    # Absolute paths to the source feature trees.
    dirs = {}
    for d in ['spec', 'mfcc']:
        dirs[d] = os.path.abspath(os.path.join(root, d))

    # Wipe any previous split so stale links do not leak into the new one.
    for d in ['train', 'dev']:
        d = os.path.join(out_dir, d)
        if os.path.isdir(d):
            print('removing directory', d, 'for resampling')
            shutil.rmtree(d)

    for d in ['train', 'dev', 'train/spec', 'train/mfcc', 'dev/spec', 'dev/mfcc']:
        dirs[d] = os.path.join(out_dir, d)
        if not os.path.isdir(dirs[d]):
            print('making directory', dirs[d])
            os.makedirs(dirs[d])

    splitter = DatasetSplitter(dirs)
    splitter.split_data(train_portion, max_files_to_process)