-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse_db.py
126 lines (106 loc) · 4.78 KB
/
parse_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2018 University of Groningen
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from rtp_parser import parse_itp, parse_rtp
from mapping_parser import mapping_parser
from collections import namedtuple
from functools import lru_cache
import os.path as osp
import networkx as nx
import openpyxl
from pysmiles import read_smiles
COLUMNS = namedtuple('Row', 'name skip aa_name cg_name aa_file cg_file '
'mapping_file smiles inchi inchikey comments '
'references'.split())
PARSERS = {'itp': parse_itp, 'rtp': parse_rtp}
def use_row(row):
return not row.skip
def get_mol(filename, name, skip_H=True):
return parse_file(filename, skip_H)[name]
@lru_cache()
def parse_file(filename, skip_H=True):
base, ext = osp.splitext(filename)
ext = ext.lstrip('.')
mols = PARSERS[ext](filename, skip_H)
return mols
def get_mapping(filename, normalize=False):
names, table, mapdict = mapping_parser(filename, normalize)
return names, table, mapdict
def first_letter(string):
if string.upper().startswith('CL'):
return 'Cl'
elif string.upper().startswith('FE'):
return 'Fe'
elif string.upper().startswith('MG'):
return 'Mg'
for char in string:
if char.isalpha():
return char
def add_element(mol):
for idx in mol:
node = mol.nodes[idx]
if 'element' not in node:
node['element'] = first_letter(node['name'])
def parse_db(xls_file, aa_dir, cg_dir, map_dir):
wb = openpyxl.load_workbook(xls_file, read_only=True)
ws = wb.active
molecules = []
for row in ws.iter_rows(min_row=2):
try:
row_vals = [cell.value for cell in row]
if len(row_vals) > len(COLUMNS._fields):
row_vals = row_vals[:len(COLUMNS._fields)]
else:
row_vals += [None] * (len(COLUMNS._fields) - len(row_vals))
row = COLUMNS(*row_vals)
except Exception as err:
print([cell.value for cell in row])
raise
if use_row(row):
aa_mol = get_mol(osp.join(aa_dir, row.aa_file), row.aa_name)
cg_mol = get_mol(osp.join(cg_dir, row.cg_file), row.cg_name)
map_names, table, mapdict = get_mapping(osp.join(map_dir, row.mapping_file), True)
add_element(aa_mol)
if set((row.aa_name, row.cg_name)) < set(map_names):
print('Mapping names seem off:')
print(map_names, row.aa_name, row.cg_name)
if row.smiles:
smiles_mol = read_smiles(row.smiles, explicit_hydrogen=False, zero_order_bonds=False)
for node_idx in smiles_mol:
smiles_mol.nodes[node_idx]['element'] = smiles_mol.nodes[node_idx]['element'].capitalize()
matcher = nx.isomorphism.GraphMatcher(aa_mol, smiles_mol, node_match=nx.isomorphism.categorical_node_match('element', 'H'))
match = list(matcher.match())
if not match:
print('Smiles and ITP for {} do not match'.format(row.aa_name))
print(len(aa_mol), aa_mol.nodes(data='element'))
print(len(aa_mol.edges), aa_mol.edges)
print(len(smiles_mol), smiles_mol.nodes(data='element'))
print(len(smiles_mol.edges), smiles_mol.edges)
else:
match = match[0]
for aa_idx, smi_idx in match.items():
smi_node = smiles_mol.nodes[smi_idx]
aa_node = aa_mol.nodes[aa_idx]
for attr, val in smi_node.items():
aa_node[attr] = val
for aa_idx, aa_jdx in aa_mol.edges:
smi_idx, smi_jdx = match[aa_idx], match[aa_jdx]
for attr, val in smiles_mol.edges[smi_idx, smi_jdx].items():
aa_mol.edges[aa_idx, aa_jdx][attr] = val
aa_mol.graph['smiles'] = row.smiles
else:
print('smiles for mol {} is missing.'.format(row.aa_name))
molecules.append(((row.aa_name, row.cg_name), aa_mol, cg_mol, table, mapdict))
print('Read {} molecules'.format(len(molecules)))
return molecules