-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpeak_annotation.py
149 lines (127 loc) · 5.61 KB
/
peak_annotation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 2 20:10:34 2019
@author: qlj874
Usage:
python peak_annotation.py <path to combined dir> <match_tolerance>
This script categorizes ions from MS2 spectra and collects their intensities.
As input, the MaxQuant output folder is used. Specifically, the msms.txt table
and the peak files (.apl)
The peaks are categorized in
- precursor
- ET-no-D (charge reduced precursors)
- fragments (as identified by MaxQuant)
"""
import pandas as pd
import numpy as np
import os
import sys
import warnings
from tqdm import tqdm
#combined_dir = 'C:/Users/qlj874/Documents/ETnoD/ET_noD_test/combined'
#combined_dir = 'C:/Users/qlj874/Documents/HeLa_NCE_tests/combined'
#match_tolerance = 0.05
combined_dir = sys.argv[1]
match_tolerance = float(sys.argv[2])
def match_float_to_array(x, search_array, tol=match_tolerance):
"""
This function finds the closest match for an float to an array of floats
and returns the match index from search_array if it is below a given
tolerance (in Da) or -1 if it's above the tolerance.
"""
diffs = np.absolute(search_array - x)
min_diff = diffs.min()
min_index = np.where(diffs == min_diff)
if min_diff <= tol:
if len(min_index[0]) != 1:
# if there are multiple matches just take the first one but raise
# a warning
warnings.warn('Multiple matches found, taking the first one')
min_index = min_index[0]
return(int(min_index[0]))
else:
return(-1)
def main():
print('read apl files')
# scan folder for apl files
apl_files = []
for file in os.listdir(combined_dir + '/andromeda/'):
if file.endswith(".apl") and not 'secpep' in file:
apl_files.append(os.path.join(combined_dir + '/andromeda/', file))
# parse the apl file into a list of dicts
# this is a very slow parser that could be optimized
spectra = []
for apl_file in tqdm(apl_files):
with open(apl_file, 'r') as f:
for l in f.readlines():
if l == 'peaklist start\n':
row = {}
mz = []
intens = []
elif l[:2] == 'mz':
row['mz'] = float(l[3:].strip())
elif l[:13] == 'fragmentation':
row['fragmentation'] = l[14:].strip()
elif l[:6] == 'charge':
row['charge'] = int(l[7:].strip())
elif l[:6] == 'header':
x = l.split(' ')
row['raw_file'] = x[1]
row['index'] = int(x[3])
elif l == 'peaklist end\n':
row['mz_ar'] = np.array(mz)
row['int_ar'] = np.array(intens)
spectra.append(row)
elif l == '\n':
continue
else:
vals = [x.strip() for x in l.split('\t')]
mz.append(float(vals[0]))
intens.append(float(vals[1]))
# for each spectrum, annotate precursor intensity and etnoD intensities.
# since MaxQuant performs charge deconvolution on every isotope pattern
# with the assumption that the charge state is purely dependent on added
# protons, charge reduced precursors by etd (electron) can be found like
# this:
# mz*charge - (0, 1, 2, ..., prec_charge - 2)
print('annotate spectra')
for s in tqdm(spectra):
prec = (s['mz'] * s['charge']) - s['charge'] + 1
ind = match_float_to_array(prec, s['mz_ar'])
if ind == -1:
s['prec_int'] = 0
else:
s['prec_int'] = s['int_ar'][ind]
if s['charge'] > 1:
et_no_d = [(s['mz'] * s['charge']) - j for j in range(s['charge'] - 1)]
s['et_no_d_int'] = sum([s['int_ar'][z] for z in [y for y in [match_float_to_array(x, s['mz_ar']) for x in et_no_d] if y != -1]])
else:
s['et_no_d_int'] = np.nan
spectra = pd.DataFrame(spectra)
# if the charge state of an precursor is unknown, MaxQuant copies
# the spectra with different charge annotations.
# Remove those spectra
spectra = spectra[~spectra[['index', 'raw_file']].duplicated(keep=False)]
# now the index is unique for each raw file
spectra = spectra.set_index(['raw_file', 'index'])
# load the identifications
# For simplification, spectra that have second peptides are removed
# Spectra with an empty matches column are also removed
msms = pd.read_table(combined_dir + '/txt/msms.txt')
msms = msms.drop_duplicates(['Raw file', 'Scan number'], keep=False)
msms = msms[~msms['Matches'].isna()]
msms.set_index(['Raw file', 'Scan number'], inplace=True)
msms.index.names = ['raw_file', 'index']
# join dfs
spectra = spectra.join(msms, how='left')
# add up fragment ion intensities
spectra.loc[~spectra['Intensities'].isna(),'fragment_intensity'] = spectra[~spectra['Intensities'].isna()]['Intensities'].apply(lambda x: sum([float(y) for y in x.split(';')]))
spectra['relative_fragmentation'] = spectra['fragment_intensity'] / spectra['prec_int']
spectra['precursor_et_no_d'] = spectra['et_no_d_int'] / spectra['prec_int']
# remove arrays
spectra = spectra.drop(['mz_ar','int_ar'], axis=1)
return(spectra)
spectra = main()
# save file
# the table is quite big and could be reduced to the important columns
spectra.to_csv(combined_dir + '/txt/spectra.txt', sep='\t')