forked from ProteinQure/cbh21-protein-solubility-challenge
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcalc_fractions.py
168 lines (147 loc) · 4.2 KB
/
calc_fractions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import Bio.PDB.DSSP as DSSP
import Bio.PDB as PDB
import numpy as np
import glob
from Bio.PDB.DSSP import dssp_dict_from_pdb_file
def get_feats(file, dssp_dict=None):
    """Extract charge and surface-hydrophobicity features for one protein.

    Args:
        file: Path of the PDB file. It is handed to
            ``dssp_dict_from_pdb_file`` unless ``dssp_dict`` is supplied.
        dssp_dict: Optional, already-computed DSSP mapping (element 0 of the
            tuple returned by ``dssp_dict_from_pdb_file``). When given,
            ``file`` is not read and DSSP is not run again.

    Returns:
        A 6-tuple, in this order:

        * ``frac_k_minus_r`` - fraction of K minus fraction of R
        * ``frac_neg`` - fraction of D and E residues
        * ``frac_pos`` - fraction of K, H and R residues
        * ``frac_charged`` - fraction of D, E, K, H and R residues
        * ``pos_minus_neg`` - ``frac_pos - frac_neg``
        * ``exp_score`` - minus the share of exposed residues that are
          hydrophobic (0 when no residue is exposed)

        All fractions are relative to the number of residues in the DSSP
        mapping; every value is 0 for an empty mapping.
    """
    negative_codes = ('D', 'E')
    positive_codes = ('K', 'H', 'R')
    hydrophobic_codes = frozenset(('A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W'))
    # A residue counts as exposed when its accessibility exceeds this value.
    # NOTE(review): the previous code called the value "RASA" (relative ASA);
    # Biopython documents index 2 of each entry as the DSSP accessibility
    # column - confirm that 150 is on the intended scale.
    exposed_threshold = 150

    if dssp_dict is None:
        # The parser returns a tuple; element 0 is the per-residue mapping.
        dssp_dict = dssp_dict_from_pdb_file(file)[0]

    # Each entry is indexed as [0] amino-acid code, [1] secondary structure,
    # [2] accessibility. Only [0] and [2] are needed for the returned features.
    # The alpha/beta burial fractions that were previously parked in a string
    # literal were never returned, so they are no longer computed here.
    records = list(dssp_dict.values())
    n_residues = len(records)

    # Count every occurrence, including the first one (the earlier version
    # started each count at 0 and therefore under-counted by one).
    counts = {}
    for record in records:
        code = record[0]
        counts[code] = counts.get(code, 0) + 1

    def fraction(codes):
        """Return the share of residues whose code is in ``codes``."""
        if not n_residues:
            return 0.0
        return sum(counts.get(code, 0) for code in codes) / n_residues

    # Normalise exactly once so all features share the same scale
    # (the earlier version divided the charged fractions by the length twice).
    frac_k_minus_r = fraction(('K',)) - fraction(('R',))
    frac_neg = fraction(negative_codes)
    frac_pos = fraction(positive_codes)
    frac_charged = fraction(negative_codes + positive_codes)
    pos_minus_neg = frac_pos - frac_neg

    # Surface hydrophobicity: -1 for every exposed hydrophobic residue,
    # normalised by the number of exposed residues.
    exposed = 0
    exposed_hydrophobic = 0
    for record in records:
        if record[2] > exposed_threshold:
            exposed += 1
            if record[0] in hydrophobic_codes:
                exposed_hydrophobic += 1
    exp_score = -exposed_hydrophobic / exposed if exposed else 0

    return (
        frac_k_minus_r,
        frac_neg,
        frac_pos,
        frac_charged,
        pos_minus_neg,
        exp_score,
    )
def feat_list(list_of_proteins):
    """Compute the ``get_feats`` feature tuple for each protein file.

    Args:
        list_of_proteins: Iterable of PDB file paths; each one is passed to
            ``get_feats`` unchanged.

    Returns:
        A list with one feature tuple per input path, in input order.
        An empty input yields an empty list.
    """
    return [get_feats(protein_file) for protein_file in list_of_proteins]
# list_prots=glob.glob("data/training/crystal_structs/*.pdb")