forked from CFSAN-Biostatistics/C-WAP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeconvolveVariants.py
executable file
·244 lines (198 loc) · 10.3 KB
/
deconvolveVariants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/usr/bin/env python3
import numpy as np
import pickle
import csv
import sys
from sklearn.linear_model import LinearRegression
# Variant file name is passed as a command line argument
variantFilename = sys.argv[1]
outputDirectory = sys.argv[2]
variantDBfilename = sys.argv[3]
# Import the pre-processed variant definitions from file
with open(variantDBfilename, 'rb') as file:
uniqueVarNames = pickle.load(file)
uniqueMutationLabels = pickle.load(file)
var2mut = pickle.load(file)
mut2var = pickle.load(file) # Skipped these for efficiency
importantVars = pickle.load(file)
pos2gene = pickle.load(file)
gene2pos = pickle.load(file)
sigMutationMatrix = pickle.load(file)
# Re-compute some useful parameters
NUM_MUTATIONS = len(uniqueMutationLabels)
NUM_VARIANTS = len(uniqueVarNames)
# Parse the variant call file (vcf from samtools or tsv from iVar) and
# retain the statistically highly significant mutations detected.
outfile = open(outputDirectory+"/mutationTable.html", 'w')
with open(variantFilename, encoding='cp1252') as infile:
reader = csv.reader(infile, delimiter="\t")
infile.readline() # Skip header line
# Loop over all detected variants and tabulate their identity.
# Only consider variants that are statistically highly significant
freqVec = np.zeros(NUM_MUTATIONS+1)
prevMutPos = 0
for row in reader:
pValue = float(row[12])
if pValue < 0.01:
mutationPos = int(row[1])
# The length of the "assembled" reads is longer than the reference genome. Ignore the rest.
if mutationPos > 29903:
break
# Avoid duplicate reporting, which happens if multiple overlapping
# ORF's are present in the GFF file provided to iVar.
if prevMutPos == mutationPos:
continue
else:
prevMutPos = mutationPos
# Convert genomic coordinates to CDS coordinates
if len(pos2gene[mutationPos]) >= 3:
# This is in a known CDS
ORFname = pos2gene[mutationPos][2]
# This is in ORF1B which has -1 shift due to ribosome slippage
if ORFname == "ORF1AB" and "ORF1B" in pos2gene[mutationPos]:
mutationPosORF = (mutationPos-gene2pos["ORF1B"][0])//3 + 1
else:
mutationPosORF = (mutationPos-gene2pos[ORFname][0])//3 + 1
else:
# This is in a UTR
ORFname = "NUC"
mutationPosORF = mutationPos
# Parse mutation data from file
refBase = row[2]
altBase = row[3]
altFreq = float(row[10])
refAA = row[16]
altAA = row[18]
# Construct a mutation label based on the reporting format of
# the mutations in constellations
if (len(altAA) == 1) and (ORFname != "NUC") and (refAA != altAA):
# Mutations causing residue substitutions in CDS
mutationLabel = ''.join(
(ORFname, ':', refAA, str(mutationPosORF), altAA))
else:
if (len(altBase) == 1) or (altBase[0] == '+'):
# Insertions (+A +AT etc.) OR substitutions in UTR OR
# silent residue substitutions in CDS
mutationLabel = ''.join(
('NUC:', refBase, str(mutationPos), altBase))
else:
# Deletions in CDS or UTR
mutationLabel = ''.join(
('DEL:', str(mutationPos), ':', str(len(altBase)-1)))
if mutationLabel in uniqueMutationLabels:
mutIdx = uniqueMutationLabels.index(mutationLabel)
freqVec[mutIdx] = altFreq
# Generate html output to be used for the "detected mutations" section
if mutationLabel in uniqueMutationLabels:
compatibleVars = []
for varname in mut2var[mutationLabel]:
compatibleVars.append(
"<a href=\"https://outbreak.info/situation-reports?pango="
+ varname + "\">" + varname + "</a>")
compatibleVars = ', '.join(compatibleVars)
else:
compatibleVars = "None found"
# If present at a significant level, then include among the mutations
# to be shown on the report file.
if altFreq >= 0.05:
outfile.write("<tr>\n")
outfile.write(" <td>%d</td> <td>%s</td> <td>%s</td> <td>%.3f</td> <td nowrap>%.2E</td>"
"<td>%s</td> <td>%s</td>\n"
% (mutationPos, refBase, altBase, altFreq, pValue, mutationLabel, compatibleVars))
outfile.write("</tr>\n")
outfile.close()
# Tle last entry of vector is set to 1000 to serve for normalisation constraint
freqVec[-1] = 1000
# Perform linear regression to estimate variant prevalences
linReg = LinearRegression(positive=True).fit(sigMutationMatrix, freqVec)
variantPercentages = 100*linReg.coef_
R2 = linReg.score(sigMutationMatrix[0:-1, :], freqVec[0:-1])
# Log the deduced variant decomposition to file for future use.
# Format: varName %present per each line
with open(outputDirectory + "/linearDeconvolution_abundance.csv", 'w') as logfile:
for i in range(NUM_VARIANTS):
logfile.write('%s %.1f\n' % (uniqueVarNames[i], variantPercentages[i]))
# Print the name of the most abundant variant and its fraction to the terminal.
highestFreq = np.amax(variantPercentages)
highestIdx = np.argmax(variantPercentages)
print('%.2f %s %.2f' % (highestFreq, uniqueVarNames[highestIdx], R2))
# Check if the presence of a variant is supported or not.
# This is done by verifying if all of the mutations of the variant
# have been observed. If at least one of the characteristic mutations
# is undetected, the variant is said to be not present.
def isVarSupported(varname):
if varname in uniqueVarNames:
varID = uniqueVarNames.index(varname)
relevantMuts = {'supporting': [], 'unsupporting': []}
for i in range(NUM_MUTATIONS):
if sigMutationMatrix[i, varID] == 1:
if freqVec[i] < 0.001:
# At least one of the mutations does not exist (below threshold presence)
relevantMuts['unsupporting'].append(uniqueMutationLabels[i])
else:
# This mutation is present to sufficient degree.
relevantMuts['supporting'].append(uniqueMutationLabels[i])
else:
raise LookupError("The mutation signatures for variant %s not found" % var)
return relevantMuts
outfile = open(outputDirectory + '/VOC-VOIsupportTable.html', 'w')
outfile.write("""<p> Under the assumption that the presence of a variant requires the detection
of all respective mutations of the variant, the characteric mutations which support the presence of
the respective variant are indicated in the respective column of the table. Numbers show the number
of mutations detected, if any, and the number of mutations expected to be present based on the
variant definitions.\n""")
# Build one html table for each genre of variants (VOC/VOI/VUI)
def buildVarSupportTable(varGenre):
outfile.write("<table>\n<tr>\n <td>%s</td>" % varGenre)
for varname in importantVars[varGenre]:
outfile.write("<td>%s</td>" % ("<a href=\"https://outbreak.info/situation-reports?pango=" +
varname + "\">" + varname + "</a>"))
outfile.write("</tr>\n<tr>\n <td>Characteristic mutations <br> detected</td>")
for varname in importantVars[varGenre]:
mutSupport = isVarSupported(varname)
outfile.write("<td>")
outfile.write('(%d of %d)<br>' % (len(mutSupport['supporting']),
len(mutSupport['supporting']) + len(mutSupport['unsupporting'])))
for mut in mutSupport['supporting']:
outfile.write('%s<br>' % mut)
outfile.write("</td>")
outfile.write("\n</tr>\n</table>\n<br>\n")
if len(importantVars["VOC"]) > 0:
buildVarSupportTable("VOC")
if len(importantVars["VOI"]) > 0:
buildVarSupportTable("VOI")
if len(importantVars["VUI"]) > 0:
buildVarSupportTable("VUI")
#################################################################################
# Jaccard index to compute pairwise similarity measured between important variant groups.
def jaccard_index (list1, list2):
intersection = len(list(set(list1).intersection(list2)))
union = len(list1) + len(list2) - intersection
if union == 0:
return -1
else:
return float(intersection) / union
def calculate_variant_jaccards (var1, var2):
muts1 = isVarSupported(var1)
muts2 = isVarSupported(var2)
measured_jaccard = jaccard_index(muts1['supporting'], muts2['supporting'])
expected_jaccard = jaccard_index(muts1['supporting']+muts1['unsupporting'], muts2['supporting']+muts2['unsupporting'])
return (measured_jaccard, expected_jaccard)
# Export an html table showing Jaccard Index between VOCs measured(expected -> href Venn diagram)
outfile.write('\n<br>\n<a href=\"https://en.wikipedia.org/wiki/Jaccard_index">Jaccard Index</a> is a measure of similarity between two sets A and B, reaching the maximum value of 1 if A=B and minimum value of 0 if A ∩ B = {}. In the c(d) representation below, c represents the Jaccard index of the set of mutations that were experimentally detected for this sample as listed above, whereas d refers to the ideal value of the Jaccard index expected from complete genome coverage without any sequencing errors.<br>')
outfile.write("<table>\n<tr>\n")
outfile.write(" <td></td>")
vars2plot = importantVars["VOC"]
for var in vars2plot:
outfile.write("<td>%s</td>" % var)
outfile.write("</tr>\n")
for var1 in vars2plot:
outfile.write('<tr>\n')
outfile.write(' <td>%s</td>' % var1)
for var2 in vars2plot:
(measured_jaccard, expected_jaccard) = calculate_variant_jaccards(var1, var2)
url = 'https://cov-spectrum.org/explore/World/AllSamples/Past6M/variants?pangoLineage=%s&pangoLineage1=%s&analysisMode=CompareEquals&' % (var1,var2)
outfile.write(' <td>%.2f (<a href=\"%s\">%.2f</a>)</td>' % (measured_jaccard, url, expected_jaccard))
outfile.write('</tr>\n')
outfile.write("</table>\n<br>\n")
outfile.close()