-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathconvert_msmarco_pass.sh
executable file
·49 lines (37 loc) · 1.38 KB
/
convert_msmarco_pass.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/bin/bash -e
# The main script to convert MSMARCO passage collection
# It is called after download_msmarco_pass.sh
#
# Variables this script reads (presumably provided by common_conv.sh or by
# the calling environment -- confirm against common_conv.sh):
#   src                  directory holding the input files
#   inputDataDir         root directory for the converted output
#   ANSWER_FILE_JSONL    base name of the converted passage file
#   QUESTION_FILE_JSONL  name of the converted query file
#   QREL_FILE            name under which relevance judgements are stored

# The -e flag on the shebang line is silently dropped when the script is
# started as "bash convert_msmarco_pass.sh", so enable it explicitly too.
set -e

source ./data_convert/common_conv.sh

checkVarNonEmpty "ANSWER_FILE_JSONL"
checkVarNonEmpty "QUESTION_FILE_JSONL"
checkVarNonEmpty "inputDataDir"
# These two are used below as well. An empty QREL_FILE would make cp copy
# the qrels into the part directory under their original names, and an empty
# src would make every input path resolve against the filesystem root.
checkVarNonEmpty "QREL_FILE"
checkVarNonEmpty "src"

# Extra option passed to every converter call. It is expanded unquoted on
# purpose, so the leading space disappears during word splitting.
BERT_TOK_OPT=" --bert_tokenize"
# Create one output sub-directory per data part. The path is quoted so that
# an inputDataDir containing spaces or glob characters is not split/expanded.
for part in pass train dev eval dev.small eval.small test2019 test2020 ; do
  mkdir -p -- "$inputDataDir/$part"
done
# Convert the 2019 and 2020 test query sets. Each one is read from
# $src/msmarco-<part>-queries.tsv and written to the matching part directory.
for testPart in test2019 test2020 ; do
  queryTsv="$src/msmarco-${testPart}-queries.tsv"
  queryJsonl="$inputDataDir/${testPart}/$QUESTION_FILE_JSONL"

  # BERT_TOK_OPT stays unquoted so that it is passed as a separate option.
  python -u ./data_convert/msmarco/convert_queries.py $BERT_TOK_OPT \
    --input "$queryTsv" --output "$queryJsonl"
done
# Convert the passage collection itself: gzipped TSV in, gzipped output in
# the "pass" part directory.
passageTsv="$src/collection.tsv.gz"
passageJsonl="$inputDataDir/pass/${ANSWER_FILE_JSONL}.gz"

# BERT_TOK_OPT stays unquoted so that it is passed as a separate option.
python -u ./data_convert/msmarco/convert_pass.py $BERT_TOK_OPT \
  --input "$passageTsv" --output "$passageJsonl"
# For every query part: copy its qrels (when they exist) and convert its
# queries into the corresponding part directory.
for part in train dev dev.small eval eval.small ; do
  # eval has no qrels for some reason
  # [[ ... && ... ]] replaces the obsolescent and ambiguous "[ ... -a ... ]".
  if [[ "$part" != "eval" && "$part" != "eval.small" ]] ; then
    cp "$src/qrels.$part.tsv" "$inputDataDir/$part/$QREL_FILE"
  fi

  # BERT_TOK_OPT stays unquoted so that it is passed as a separate option.
  python -u ./data_convert/msmarco/convert_queries.py \
    $BERT_TOK_OPT \
    --input "$src/queries.$part.tsv" \
    --output "$inputDataDir/$part/$QUESTION_FILE_JSONL"
done
# Copy the relevance judgements of the 2019 and 2020 test sets into their
# part directories under the common qrel file name.
for year in 2019 2020 ; do
  cp "$src/${year}qrels-pass.txt" "$inputDataDir/test${year}/$QREL_FILE"
done
# Rename the *.small parts to their final *_official names.
# Full paths are used instead of "cd ...; cd -", so the working directory is
# never changed, nothing is printed, and a path with spaces still works.
for smallPart in eval dev ; do
  smallDir="$inputDataDir/${smallPart}.small"
  officialDir="$inputDataDir/${smallPart}_official"

  # If the target is left over from an earlier run, mv would move the source
  # INTO it (e.g. eval_official/eval.small) instead of renaming it. Replace
  # the stale target, but only once the freshly converted source is known to
  # exist, so that nothing is deleted when there is nothing to move.
  if [[ -d "$smallDir" && -e "$officialDir" ]] ; then
    rm -rf -- "$officialDir"
  fi

  mv -- "$smallDir" "$officialDir"
done