-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrun-extract-rels.sh
executable file
·257 lines (234 loc) · 6.09 KB
/
run-extract-rels.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#! /bin/bash
# Wrapper around vrt-extract-relations.py: reads VRT from standard input
# and writes sorted, gzipped relation tables, optionally packed into a
# tar archive (see usage below).

# Quote $0 so that a script path containing whitespace still works.
progname=$(basename -- "$0")
progdir=$(dirname -- "$0")
# All arguments joined with spaces; only used for the verbose "Run:" line.
cmdline="$*"

export LC_ALL=C

# Detect subprocess errors also in non-final components of pipelines
set -o pipefail

# Option specifications. They are not referenced again in this file;
# presumably they are consumed by korp-lib.sh -- TODO confirm.
shortopts="hc:f:r:o:v"
longopts="help,corpus-name:,input-fields:,relation-map:,output-dir:,decode-input,optimize-memory,no-tar,no-archive,keep-temp-files,verbose"

# Option defaults (empty means "not set" / "off")
corpus_name=
input_fields=
relation_map=
input=
output_dir=.
decode_input=
optimize_memory=
keep_temp_files=
verbose=
make_archive=1

# Shared helper library located next to this script.
. "$progdir/korp-lib.sh"

# Options always passed to vrt-extract-relations.py. $tmpdir is not set in
# this file; it is expected to come from korp-lib.sh -- verify.
extract_rels_opts="--output-type=new-strings --word-form-pair-type=baseform --ignore-unknown-relations --temp-dir=$tmpdir"
usage () {
    # Print the synopsis and the list of supported options to standard
    # output, then terminate the script with a success status.
    printf '%s\n' \
        "Usage: $progname [options] < input.vrt" \
        'Extract dependency relations from VRT files to TSV files for the Korp' \
        'word picture.' \
        'This is a wrapper script for vrt-extract-relations.py. This script' \
        'sorts the output tables outside the Python script, produces a tar' \
        'archive of the relations files and optionally writes log output.' \
        'Options:' \
        '-h, --help' \
        '-c, --corpus-name CORPUS' \
        '-f, --input-fields FIELDLIST' \
        '-r, --relation-map FILE' \
        '-o, --output-dir DIR' \
        '--decode-input' \
        '--optimize-memory' \
        '--no-tar, --no-archive' \
        '--keep-temp-files' \
        '-v, --verbose'
    exit 0
}
# Process options: consume leading option arguments from the positional
# parameters and record them in the corresponding variables. Parsing stops
# at "--", at the first argument not beginning with "--" that is not a
# known short option, or when the arguments run out.
while [ $# -gt 0 ]; do
    case "$1" in
        -h | --help )
            usage
            ;;
        -c | --corpus-name )
            corpus_name=$2
            shift
            ;;
        -f | --input-fields )
            input_fields=$2
            shift
            ;;
        -r | --relation-map )
            relation_map=$2
            shift
            ;;
        -i | --input )
            # FIXME: This is unimplemented; what was the original idea?
            input=$2
            shift
            ;;
        -o | --output-dir )
            output_dir=$2
            shift
            ;;
        --decode-input )
            decode_input=1
            ;;
        --optimize-memory )
            optimize_memory=1
            extract_rels_opts="$extract_rels_opts --raw-output"
            ;;
        --keep-temp-files )
            keep_temp_files=1
            # Not read in this file; presumably checked by the exit
            # handler in korp-lib.sh -- verify.
            cleanup_on_exit=
            ;;
        --no-tar | --no-archive )
            make_archive=
            ;;
        -v | --verbose )
            verbose=1
            ;;
        -- )
            # End-of-options marker: drop only the marker and stop.
            # (Falling through to the common "shift" below would also
            # discard the first operand and keep parsing what follows.)
            shift
            break
            ;;
        --* )
            # Unknown long options are reported but do not stop parsing.
            warn "Unrecognized option: $1"
            ;;
        * )
            break
            ;;
    esac
    shift
done
# Mandatory options: report each one that was left empty. The checks run
# in a fixed order (corpus name, input fields, relation map).
[ -n "$corpus_name" ] ||
    error "Please specify corpus name with --corpus-name"
[ -n "$input_fields" ] ||
    error "Please specify input field names with --input-fields"
[ -n "$relation_map" ] ||
    error "Please specify relation map file with --relation-map"
tempdir_usage () {
    # Print "Tempdir usage: SIZE", where SIZE is the human-readable total
    # size of $tmpfile_dir as reported by "du -sh".
    printf "Tempdir usage: "
    # du separates the size from the path with a TAB, which is cut's
    # default field delimiter; keep only the size column.
    du -sh "$tmpfile_dir" |
        cut -f1
}
check_subprocess_error () {
    # $1: exit status of the command or pipeline that has just finished.
    # The literal string "0" means success; any other value is reported
    # through error.
    [ "$1" = "0" ] && return 0
    error "Aborting because of an error in a subprocess"
}
sort_and_gzip () {
    # Sort each file given as an argument and write the result gzipped to
    # FILE.gz. The input is first renamed to FILE.unsorted, which is left
    # in place. Files named *_rels.tsv, *_rels_sentences.tsv or
    # *_rels_strings.tsv are sorted numerically (sort -n); all others use
    # the default sort order. File names are quoted so that paths
    # containing whitespace or glob characters are handled correctly.
    local f sort_opts
    for f in "$@"; do
        mv -- "$f" "$f.unsorted"
        sort_opts=
        case $f in
            *_rels.tsv | *_rels_sentences.tsv | *_rels_strings.tsv )
                sort_opts=-n
                ;;
        esac
        # Expands to nothing (not to an empty argument) when no extra
        # sort option is needed.
        sort ${sort_opts:+"$sort_opts"} "$f.unsorted" | gzip > "$f.gz"
        check_subprocess_error "$?"
    done
}
process_raw_output () {
    # Turn the *.raw.tsv tables in $tmpfile_dir (the output selected with
    # --optimize-memory, which adds --raw-output to the extractor options)
    # into the final gzipped tables:
    #   - head_rel / dep_rel: count duplicate lines and append the count
    #     as the last column;
    #   - main relations table: count duplicate lines, number the distinct
    #     lines, and output "id, next three fields, count", dropping the
    #     first field (the key);
    #   - sentences table: replace the key in the first column by the id
    #     assigned in the main table;
    #   - *_rel.tsv and *_strings.tsv: plain sort_and_gzip.
    # Reads the globals tmpfile_dir and corpus_name.
    local tab base_name reltype fifo rel_ids_fname fifo_pid

    # A real TAB character, to be used in sed expressions for better
    # compatibility than \t. Written with ANSI-C quoting so that it cannot
    # be turned into a space by whitespace-mangling editors.
    tab=$'\t'
    base_name=$tmpfile_dir/${corpus_name}_rels

    for reltype in head_rel dep_rel; do
        # "uniq -c" prefixes each line with "COUNT "; move the count to
        # the end as a new tab-separated column.
        sort -n "${base_name}_${reltype}.raw.tsv" |
            uniq -c |
            sed -e "s/^ *\([0-9]\+\) \(.*\)/\2${tab}\1/" |
            gzip > "${base_name}_${reltype}.tsv.gz"
        check_subprocess_error "$?"
    done

    fifo=$tmpfile_dir/rel_ids.fifo
    rel_ids_fname=$tmpfile_dir/rel_ids.tsv
    mkfifo "$fifo"
    check_subprocess_error "$?"

    # Background reader: from the numbered, counted lines sent to the FIFO
    # extract "key TAB id" and sort by key for the join below. (The
    # original pipeline began with a "sort -nr" whose output was never
    # read, because sed takes its input from the FIFO; it has been
    # dropped.)
    (
        sed -e "s/^ *\([0-9]\+\)${tab} *[0-9]\+ \([^ ${tab}]\+\).*/\2${tab}\1/" < "$fifo" |
            sort -t"${tab}" -k1,1 > "$rel_ids_fname"
    ) &
    fifo_pid=$!

    # "cat -n" numbers the distinct lines; the line number becomes the
    # relation id. tee feeds the same stream to the background reader.
    sort "${base_name}.raw.tsv" |
        uniq -c |
        cat -n |
        tee "$fifo" |
        sed -e "s/^ *\([0-9]\+\)${tab} *\([0-9]\+\) \([^${tab}]\+\)${tab}\([^${tab}]\+${tab}[^${tab}]\+${tab}[^${tab}]\+\)/\1${tab}\4${tab}\2/" |
        gzip > "${base_name}.tsv.gz"
    check_subprocess_error "$?"

    # The key-to-id table must be complete before it is joined.
    wait "$fifo_pid"
    check_subprocess_error "$?"

    # Join on the key (first field of both inputs) and output the id
    # followed by fields 2-4 of the sentences table.
    sort -t"${tab}" -k1,1 "${base_name}_sentences.raw.tsv" |
        join -t"${tab}" -j1 -o '2.2 1.2 1.3 1.4' - "$rel_ids_fname" |
        sort -n |
        gzip > "${base_name}_sentences.tsv.gz"
    check_subprocess_error "$?"

    sort_and_gzip "${base_name}_rel.tsv" "${base_name}_strings.tsv"
}
preprocess_input () {
    # Filter from standard input to standard output: a plain copy unless
    # --decode-input was given, in which case the data is passed through
    # vrt_decode_special_chars --no-xml-entities.
    if [ -z "$decode_input" ]; then
        cat
    else
        vrt_decode_special_chars --no-xml-entities
    fi
}
# Main flow: extract the relations into a work directory, sort and compress
# the tables, then either pack them into a tar archive in the output
# directory or move them there individually.
hostenv=$(get_host_env)

verbose echo Run: "$0" "$cmdline"
verbose echo Corpus: "$corpus_name"
verbose echo_timestamp Start

rels_tar=${corpus_name}_rels.tar
# $tmp_prefix is not set in this file; it is expected to come from
# korp-lib.sh -- verify.
tmpfile_dir=$tmp_prefix.work

# When an archive is requested and it already exists, nothing is done.
if [ "x$make_archive" = x ] || [ ! -e "$output_dir/$rels_tar" ]; then
    if [ "x$hostenv" = "xtaito" ]; then
        module load python-env/2.7.6 &> /dev/null
    fi
    mkdir -p "$output_dir" "$tmpfile_dir"
    verbose echo_timestamp vrt-extract-relations
    # $extract_rels_opts is deliberately left unquoted: it holds several
    # space-separated options that must be split into separate words.
    preprocess_input |
        "$progdir/vrt-extract-relations.py" \
            --output-prefix "$tmpfile_dir/${corpus_name}_rels" \
            --input-fields "$input_fields" \
            --relation-map "$relation_map" \
            $extract_rels_opts
    # With pipefail, $? is non-zero if either stage failed. Stop here
    # instead of sorting and archiving incomplete tables, as is done
    # after every later pipeline.
    check_subprocess_error "$?"
    verbose subproc_times
    # --sort --compress=gzip --temporary-files
    # Sorting and compressing files within vrt-extract-relations.py
    # often seems to leave the rels_sentences file incomplete. Why?
    verbose tempdir_usage
    if [ "x$optimize_memory" != x ]; then
        verbose echo_timestamp Postprocess: raw output
        process_raw_output
    else
        verbose echo_timestamp Postprocess: sort and gzip
        sort_and_gzip "$tmpfile_dir"/*.tsv
    fi
    verbose subproc_times
    verbose tempdir_usage
    verbose echo_timestamp tar
    # The packaging step below changes directory, so a relative output
    # directory must first be made absolute. Paths beginning with "/" or
    # "~" are used as they are.
    case "$output_dir" in
        /* | "~"* )
            real_output_dir=$output_dir
            ;;
        * )
            real_output_dir=$(pwd)/$output_dir
            ;;
    esac
    # tar cpf $rels_tar -C $output_dir --wildcards \*_rels\*.gz
    # Wildcards do not seem to work above in tar even with --wildcards. Why?
    # NOTE(review): per the GNU tar manual, wildcard options do not make
    # tar expand the names of files to be archived; that is left to the
    # shell -- confirm. Hence the glob is expanded by the shell below,
    # from inside the work directory.
    (
        # Do not archive or move files from the wrong directory if the
        # work directory is inaccessible.
        cd "$tmpfile_dir" || exit 1
        if [ "x$make_archive" != x ]; then
            tar cpf "$real_output_dir/$rels_tar" --wildcards \
                "${corpus_name}"_rels*.tsv.gz
            ensure_perms "$real_output_dir/$rels_tar"
        else
            ensure_perms "${corpus_name}"_rels*.tsv.gz
            mv "${corpus_name}"_rels*.tsv.gz "$real_output_dir"
        fi
    )
    verbose subproc_times
    if [ "x$keep_temp_files" = x ]; then
        rm -rf -- "$tmpfile_dir"
    fi
fi
verbose echo_timestamp End