-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathto_utterances.py
48 lines (44 loc) · 1.87 KB
/
to_utterances.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from tools import jsonl
from tqdm import tqdm
# Tokens that are allowed to terminate an utterance.
full_stops = {
    '?', '.', '!'
}

# Space-tokenized abbreviations; a full stop that appears near one of these
# is treated as part of the abbreviation and never closes an utterance.
_PUNCTUATED_ATOMS = (
    'i . e .',
    'e . g .',
    'u . s .',
)

# An utterance is only closed once the text buffered so far is longer than
# this many characters (counting one separating space per token).
_MIN_UTT_CHARS = 16


def _near_punctuated_atom(texts, i):
    """Return True if a punctuated atom occurs in the window around token i.

    The window spans tokens ``i - 3`` through ``i + 2`` (clipped at the
    start of the list), joined with single spaces and lower-cased; the
    check is a plain substring match against ``_PUNCTUATED_ATOMS``.
    """
    window = ' '.join(texts[max(0, i - 3):i + 3]).lower()
    return any(atom in window for atom in _PUNCTUATED_ATOMS)


def _split_utterances(words):
    """Split a tokenized transcript into a list of utterance strings.

    Args:
        words: sequence of mappings, each carrying the token under the
            ``'text'`` key.

    Returns:
        A list of utterances, each the space-joined tokens with surrounding
        whitespace stripped. Tokens left over after the last boundary form
        a final utterance.

    Raises:
        ValueError: if any token's ``'text'`` is empty (falsy).
    """
    texts = [word['text'] for word in words]
    for index, text in enumerate(texts):
        if not text:
            raise ValueError(
                'empty token text at word index {}'.format(index)
            )

    utterances = []
    pending = []
    # Mirrors the length of a buffer built as ' ' + token for every token,
    # so the threshold below counts one space per buffered token.
    pending_len = 0
    last = len(texts) - 1
    for i, text in enumerate(texts):
        # The length test deliberately looks at the buffer *before* the
        # current token is added.
        is_boundary = (
            pending_len > _MIN_UTT_CHARS
            and text in full_stops
            # A following token containing '-' blocks the split
            # (presumably it marks an interruption -- confirm against the
            # tokenizer's output).
            and (i == last or '-' not in texts[i + 1])
            and not _near_punctuated_atom(texts, i)
        )
        # The full stop stays with the utterance it ends: we don't want to
        # start the next utterance with punctuation.
        pending.append(text)
        pending_len += len(text) + 1
        if is_boundary:
            utterances.append(' '.join(pending).strip())
            pending = []
            pending_len = 0
    if pending:
        utterances.append(' '.join(pending).strip())
    return utterances


# Count the manifest records up front; the count is only used as the
# progress-bar total (assumes the manifest and the corpus hold the same
# number of records -- confirm).
with jsonl.JRZ('corpus_staging/manifest.jsonl.gz') as manifest:
    num_files = sum(1 for _ in manifest)

with jsonl.JRZ('corpus_staging/corpus.tokenized.jsonl.gz') as infile:
    pbar = tqdm(infile, total=num_files)
    for audiofile in pbar:
        pbar.set_postfix(case=audiofile['case_title'])
        out_path = audiofile['transcript_pdf_path'] + '.utts.txt'
        # Split before opening the output so a bad record aborts the run
        # without leaving a truncated .utts.txt behind.
        try:
            utterances = _split_utterances(audiofile['transcript']['words'])
        except ValueError as err:
            # SystemExit with a message prints it to stderr and exits with
            # status 1; unlike the site-provided exit(), it is always
            # available.
            raise SystemExit('ERR: {}: {}'.format(out_path, err)) from err
        # Explicit encoding keeps the output independent of the locale.
        with open(out_path, 'w', encoding='utf-8') as outfile:
            outfile.writelines(utt + '\n' for utt in utterances)