-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscore.py
239 lines (207 loc) · 9.96 KB
/
score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#!/usr/bin/env python3
"""Imputation challenge scoring script
Author:
Jin Lee ([email protected])
"""
import os
import time
import re
import gc
from score_metrics import Score, normalize_dict
from score_metrics import mse, mseprom, msegene, mseenh, msevar, mse1obs, mse1imp
from score_metrics import gwcorr, gwspear
from db import write_to_db, ScoreDBRecord
from bw_to_npy import load_bed, load_npy, bw_to_dict, dict_to_arr
from logger import log
RE_PATTERN_SUBMISSION_FNAME = r'^C\d\dM\d\d\..*(bw|bigwig|bigWig|BigWig|npy|npz|np)'
def parse_submission_filename(bw_file):
"""Filename should be CXXMYY.bigwig or CXXMYY.bw
"""
basename = os.path.basename(bw_file)
if len(re.findall(RE_PATTERN_SUBMISSION_FNAME, basename)) == 0:
raise Exception('Wrong submission filename. It should be CXXMYY.*bigwig')
cell = basename[0:3]
assay = basename[3:6]
return cell, assay
def score(y_pred_dict, y_true_dict, chroms,
gene_annotations, enh_annotations,
window_size=25, prom_loc=80,
y_var_dict=None):
"""Calculate score
Args:
bootstrapped_label_dict:
{ chr: label }
Dict of labels of a bootstrapped (subsampled) sample
For example of the full original label = [0,1,2,3] with
4-fold bootstrapping without shuffling.
Bootstrapped labels are like the following:
[0,1,2], [0,1,3], [0,2,3], [1,2,3]
We need to score a submission for each bootstrap index
In real score calculation, we shuffle it with fixed
random seed.
"""
# concat all chromosomes
y_pred_dict_norm = normalize_dict(y_pred_dict, chroms)
y_true_dict_norm = normalize_dict(y_true_dict, chroms)
y_pred = dict_to_arr(y_pred_dict, chroms)
y_true = dict_to_arr(y_true_dict, chroms)
y_pred_norm = dict_to_arr(y_pred_dict_norm, chroms)
y_true_norm = dict_to_arr(y_true_dict_norm, chroms)
if y_var_dict is None:
y_var_true = None
else:
y_var_true = dict_to_arr(y_var_dict, chroms)
output = Score(
mse=mse(y_true_norm, y_pred_norm),
gwcorr=gwcorr(y_true, y_pred),
gwspear=gwspear(y_true, y_pred),
mseprom=mseprom(y_true_dict_norm, y_pred_dict_norm, chroms,
gene_annotations,
window_size, prom_loc),
msegene=msegene(y_true_dict_norm, y_pred_dict_norm, chroms,
gene_annotations,
window_size),
mseenh=mseenh(y_true_dict_norm, y_pred_dict_norm, chroms,
enh_annotations,
window_size),
msevar=msevar(y_true_norm, y_pred_norm, var=y_var_true),
mse1obs=mse1obs(y_true_norm, y_pred_norm),
mse1imp=mse1imp(y_true_norm, y_pred_norm),
)
return output
def parse_arguments():
import argparse
import os
py_path = os.path.dirname(os.path.realpath(__file__))
parser = argparse.ArgumentParser(
description='ENCODE Imputation Challenge scoring script. .npy or .npz must be built '
'with a correct --window-size. i.e. containing a value for each bin')
parser.add_argument('pred_npy_or_bw',
help='Submission .npy or .bigwig file to be scored')
parser.add_argument('true_npy_or_bw',
help='Truth .npy or .bigwig file')
parser.add_argument('--download-submissions-from-syn-eval-queue', action='store_true',
help='Download RECEIVED submissions from Synapse '
'evaluation queue.')
parser.add_argument('--var-npy',
help='Truth .npy file filled with a variance for each bin '
'instead of a raw signal value. '
'Variance must be computed over all cell types for a target '
'assay type. This should have a object form of '
'{ chrom: [var1, var2, ...] } length([var, ...]) should match '
'with the number of bins per chromosome. '
'Use build_var_npy.py TRUTH_NPY_CELL1 TRUTH_NPY_CELL2 ... '
'to build such .npy over all cell types for a target assay')
p_score = parser.add_argument_group(
title='Scoring parameters')
p_score.add_argument('--chrom', nargs='+',
default=['all'],
help='List of chromosomes to be combined to be '
'scored. '
'Set as "all" (default) to score for all '
'chromosomes. '
'(e.g. "all" or "chr3 chr21") '
'It should be "all" to write scores to DB file')
p_score.add_argument('--bootstrap-chrom', nargs='*', default=[],
help='Bootstrapped chromosome groups. '
'Delimiter is whitespace for groups and '
'comma(,) in each group. Order is important.'
'e.g. "chr1,chr2 chr1,chrX chr2,chrX" means '
'three groups: (chr1,chr2), (chr1,chrX), (chr2,chrX)')
p_score.add_argument('--gene-annotations',
default=os.path.join(
py_path,
'annot/hg38/gencode.v29.genes.gtf.bed.gz'),
help='Gene annotations BED file')
p_score.add_argument('--enh-annotations',
default=os.path.join(
py_path,
'annot/hg38/F5.hg38.enhancers.bed.gz'),
help='Enhancer annotations BED file ')
p_score.add_argument('--blacklist-file',
default=os.path.join(
py_path,
'annot/hg38/hg38.blacklist.bed.gz'),
help='Blacklist BED file. Bootstrap label will be '
'generated after removing overlapping regions '
'defined in this file.')
p_score.add_argument('--window-size', default=25, type=int,
help='Window size for bigwig in bp')
p_score.add_argument('--prom-loc', default=80, type=int,
help='Promoter location in a unit of window size '
'(--window-size). This is not in bp')
p_score.add_argument('--validated', action='store_true',
help='For validated submissions (not for truth bigwigs) '
'with fixed interval length of 25 and valid '
'chromosome lengths. It will skip interpolation. '
'For truth bigwigs, it is recommended to convert '
'them into npy\'s or npz\'s by using '
'bw_to_npy.py')
#p_score.add_argument('--normalize-with-robust-min-max', action='store_true',
# help='Normalize with robust min max.')
p_out = parser.add_argument_group(
title='Output to file (TSV or DB)')
p_out.add_argument('--db-file',
help='Write metadata/scores to SQLite DB file')
p_meta = parser.add_argument_group(
title='Submission metadata, which will be written to '
'DB together with scores. This will be used for '
'ranking later')
p_meta.add_argument('--team-id', '-t', type=int,
help='Team ID (unique ID from Synapse)')
p_meta.add_argument('--submission-id', '-s', type=int,
help='Submission ID (unique ID from Synapse)')
args = parser.parse_args()
if args.db_file is not None:
if not os.path.exists(args.db_file):
raise ValueError('DB file does not exists')
if args.chrom == ['all']:
args.chrom = ['chr' + str(i) for i in range(1, 23)] + ['chrX']
args.chrom = sorted(args.chrom)
if len(args.bootstrap_chrom) == 0:
args.bootstrap_chrom = [(-1, args.chrom)]
else:
for i, _ in enumerate(args.bootstrap_chrom):
args.bootstrap_chrom[i] = (i, args.bootstrap_chrom[i].split(','))
print(args.bootstrap_chrom)
return args
def main():
args = parse_arguments()
y_pred_dict = bw_to_dict(args.pred_npy_or_bw, args.chrom,
args.window_size, args.blacklist_file)
y_true_dict = bw_to_dict(args.true_npy_or_bw, args.chrom,
args.window_size, args.blacklist_file)
if args.var_npy is None:
y_var_dict = None
elif args.var_npy.endswith(('.npy', '.npz')):
log.info('Opening truth var numpy array file...')
y_var_dict = load_npy(args.var_npy)
else:
raise ValueError('Var true file should be a binned .npy or .npz.')
enh_annotations = load_bed(args.enh_annotations)
gene_annotations = load_bed(args.gene_annotations)
gc.disable()
cell, assay = parse_submission_filename(args.pred_npy_or_bw)
for k, bootstrap_chrom in args.bootstrap_chrom:
log.info('Calculating score for bootstrap {} case...'.format(k))
score_output = score(y_pred_dict, y_true_dict, bootstrap_chrom,
gene_annotations, enh_annotations,
args.window_size, args.prom_loc,
y_var_dict)
s = "\t".join(['bootstrap_'+str(k)]+[str(o) for o in score_output])
print(s)
# write to DB
if args.db_file is not None:
score_db_record = ScoreDBRecord(
args.submission_id,
args.team_id,
os.path.basename(args.pred_npy_or_bw),
cell,
assay,
k,
*score_output)
write_to_db(score_db_record, args.db_file)
gc.collect()
log.info('All done')
if __name__ == '__main__':
main()