-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdrug_classes_in_aro.py
103 lines (73 loc) · 2.81 KB
/
drug_classes_in_aro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from argnorm import lib
import pandas as pd
import subprocess
import os
# Ensure the ./data directory exists before running the helper script.
# NOTE(review): the script presumably downloads the SARG / ResFinder /
# ResFinderFG tables that are read from ./data further down — confirm.
os.makedirs('./data', exist_ok=True)
# check=True raises CalledProcessError on a non-zero exit status, so a
# failed run stops the analysis here.
subprocess.run(['bash', 'get_antibiotic_class_data.bash'], check=True)
# Load the ARO ontology and locate its 'antibiotic molecule' term. The
# one-element unpacking raises if zero or several terms carry that name.
ARO = lib.get_aro_ontology()
[ab_molecule] = [t for t in ARO.terms() if t.name == 'antibiotic molecule']

# Descendants at any depth and direct children, both excluding the root term.
nr_total_descendants = len(ab_molecule.subclasses(with_self=False).to_set())
direct_descendants = ab_molecule.subclasses(1, with_self=False).to_set()
# Sanity check that the expected part of the ontology was found.
assert any(t.name == 'antibiotic mixture' for t in direct_descendants)
nr_direct_descendants = len(direct_descendants)

# Number of descendants that have subclasses of their own.
# with_self=False is required on the outer call: subclasses() yields the term
# itself by default, and counting the root as a non-leaf would make 'Leaves'
# one too small and 'Intermediate nodes' one too large, because
# nr_total_descendants does not include the root.
nr_not_leaf = sum(
    bool(d.subclasses(with_self=False).to_set())
    for d in ab_molecule.subclasses(with_self=False)
)

print(f'''# Antibiotic molecule descendant counts
Using ARO version bundled with argnorm {lib.__version__}
''')
print('ARO')
# NOTE(review): 'Intermediate nodes' is computed by subtracting counts, which
# is only exact if every direct descendant has subclasses — confirm.
print(pd.Series(
    {
        'Total descendants': nr_total_descendants,
        'Direct descendants': nr_direct_descendants,
        'Intermediate nodes': nr_not_leaf - nr_direct_descendants,
        'Leaves': nr_total_descendants - nr_not_leaf,
    },
    name='Counts',
).to_frame().to_markdown())
def _pipe_field_counts(database: str, field: int) -> pd.Series:
    """Count the lower-cased values of one '|'-separated index field.

    Args:
        database: Name passed to ``lib.get_aro_mapping_table``.
        field: Zero-based position of the wanted field after splitting each
            index entry of the mapping table on ``'|'``.

    Returns:
        The ``value_counts()`` of the extracted, lower-cased field.
    """
    return (
        lib.get_aro_mapping_table(database).index
        .str.split('|')
        .str[field]
        .str.lower()
        .value_counts()
    )


print('# Other databases')

# ARG-ANNOT: keep the text before the first ')' of each index entry and drop
# the '('. regex=False makes the literal replacement explicit: '(' on its own
# is not a valid regular expression, and the default of `regex` differs
# between pandas versions.
argannot = (
    lib.get_aro_mapping_table('argannot').index
    .str.split(')')
    .str[0]
    .str.replace('(', '', regex=False)
    .str.lower()
    .value_counts()
)

# For these databases the value reported as the antibiotic class is one
# '|'-separated field of the index entry; only its position differs.
deeparg = _pipe_field_counts('deeparg', 3)
megares = _pipe_field_counts('megares', 2)
ncbi = _pipe_field_counts('ncbi', 9)

# Tables read from ./data.
sarg = pd.read_csv('./data/SARG_structure.tsv', sep='\t')['Type'].value_counts()
resfinder = pd.read_csv('./data/resfinder_antibiotic_classes.tsv', sep='\t')['Class'].value_counts()

# ResFinderFG: relabel the columns 0..7 so the counted column can be selected
# by position (column 1). The assignment raises if the file does not have
# exactly eight columns.
resfinderfg = pd.read_csv('./data/resfinderfg_antibiotic_classes.csv', delimiter=';')
resfinderfg.columns = list(range(8))
resfinderfg = resfinderfg[1].value_counts()
# Emit one markdown section per database: a heading, the class-count table
# and the number of distinct classes. All sections are built first and then
# written with a single print call, in this fixed order.
class_counts_by_database = [
    ('ARG-ANNOT', argannot),
    ('DeepARG', deeparg),
    ('MEGARes', megares),
    ('NCBI', ncbi),
    ('SARG', sarg),
    ('ResFinder', resfinder),
    ('ResFinderFG', resfinderfg),
]
report_sections = [
    f'## {title}\n'
    f'{counts.to_markdown()}\n'
    f'Number of unique antibiotic classes: {counts.shape[0]}\n'
    for title, counts in class_counts_by_database
]
print(''.join(report_sections))