-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathindex.py
executable file
·149 lines (131 loc) · 5.4 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
"""Index HMDB protein/metabolite records with MongoDB and Elasticsearch,
Download page: http://www.hmdb.ca/downloads """
# TODO: Elasticsearch default mappings may not be good enough
from __future__ import print_function
import argparse
import os
from gzip import GzipFile
from zipfile import ZipFile
import xmltodict
from pymongo import IndexModel
from nosqlbiosets.dbutils import DBconnection
from nosqlbiosets.objutils import unifylistattributes
DOCTYPE_METABOLITE = 'hmdbmetabolite'
DOCTYPE_PROTEIN = 'hmdbprotein'
# Read HMDB Metabolites/Proteins files, index using the function indexf
def parse_hmdb_xmlfile(infile, indexf):
    """Stream an HMDB XML file through xmltodict, passing records to indexf.

    The input is opened according to its file name suffix:

    * ``.gz``  -- opened with GzipFile
    * ``.zip`` -- every member listed in the archive is parsed in turn
    * anything else -- opened as a plain binary file

    :param infile: path of the HMDB metabolites/proteins file
    :param indexf: callable handed to ``xmltodict.parse`` as its
        ``item_callback``; the methods used in this file take
        ``(path, entry)`` and return a bool
    """
    def _feed(stream):
        # Same parser settings for every kind of input; with item_depth set
        # xmltodict calls indexf per item rather than returning one document
        xmltodict.parse(stream, item_depth=2, item_callback=indexf)

    path = str(infile)
    print("Reading/indexing %s " % path)
    if path.endswith(".zip"):
        with ZipFile(path) as archive:
            for member in archive.namelist():
                with archive.open(member) as stream:
                    _feed(stream)
    elif path.endswith(".gz"):
        with GzipFile(path) as stream:
            _feed(stream)
    else:
        with open(path, 'rb', buffering=1000) as stream:
            _feed(stream)
    print("\nCompleted")
class Indexer(DBconnection):
    """Index HMDB metabolite/protein entries with Elasticsearch or MongoDB.

    The two ``*_index_hmdb_entry`` methods have the ``(path, entry)``
    signature so they can be used as the item callback of
    ``parse_hmdb_xmlfile``; both return True when the entry was stored and
    False when storing raised an exception.
    """

    def __init__(self, db, index, host, port, doctype):
        """Connect through DBconnection and prepare the target collection.

        :param db: database kind; "MongoDB" selects the MongoDB setup below
        :param index: Elasticsearch index name or MongoDB database name
        :param host: server hostname, passed through to DBconnection
        :param port: server port, passed through to DBconnection
        :param doctype: document type; also used as MongoDB collection name
        """
        self.doctype = doctype
        self.index = index
        super(Indexer, self).__init__(db, index, host, port,
                                      recreateindex=True)
        if db == "MongoDB":
            # self.mdbi is presumably the database handle set up by
            # DBconnection -- TODO confirm. Start from an empty collection.
            self.mcl = self.mdbi[doctype]
            self.mcl.drop()

    def tune(self, entry):
        """Tune an entry in place for better data representation.

        Applies ``unifylistattributes`` to the list-like attributes of the
        entry and, when present, of its taxonomy section.
        """
        unifylistattributes(entry, ["synonyms", "pathways"])
        # The taxonomy value can be None (es_index_hmdb_entry guards against
        # it as well), so only descend into it when there is something there
        if "taxonomy" in entry and entry["taxonomy"] is not None:
            unifylistattributes(
                entry["taxonomy"],
                ["alternative_parents", "substituents",
                 "external_descriptors"])

    def es_index_hmdb_entry(self, _, entry):
        """Index one HMDB metabolite/protein entry with Elasticsearch.

        The entry's accession is used as the document id. The
        ``taxonomy.molecular_framework`` and ``cs_description`` fields are
        removed from the entry before it is indexed.

        :param _: unused first callback argument
        :param entry: parsed HMDB record (dict-like)
        :return: True if indexed, False if indexing raised an exception
        """
        docid = entry['accession']
        self.tune(entry)
        if "taxonomy" in entry and entry['taxonomy'] is not None and\
                'molecular_framework' in entry['taxonomy']:
            del entry['taxonomy']['molecular_framework']
        if "cs_description" in entry:
            del entry['cs_description']
        try:
            self.es.index(index=self.index, doc_type=self.doctype,
                          id=docid, body=entry)
            self.reportprogress()
            r = True
        except Exception as e:
            # Best effort: report the failure and continue with next entry
            print(e)
            r = False
        return r

    def mongodb_index_hmdb_entry(self, _, entry):
        """Index one HMDB metabolite/protein entry with MongoDB.

        The entry is stored under ``_id`` equal to its accession; an existing
        document with the same id is replaced.

        :param _: unused first callback argument
        :param entry: parsed HMDB record (dict-like)
        :return: True if stored, False if storing raised an exception
        """
        docid = entry['accession']
        self.tune(entry)
        spec = {"_id": docid}
        try:
            # replace_one is the supported equivalent of the legacy
            # Collection.update(spec, document, upsert=True) call, which
            # was deprecated in PyMongo 3.0 and removed in PyMongo 4.0
            self.mcl.replace_one(spec, entry, upsert=True)
            self.reportprogress()
            r = True
        except Exception as e:
            # Best effort: report the failure and continue with next entry
            print(e)
            r = False
        return r
def mongodb_indices(mdb, doctype):
    """Create the text index and lookup indexes on an HMDB collection.

    :param mdb: MongoDB collection holding the indexed HMDB entries
    :param doctype: DOCTYPE_METABOLITE selects the metabolite indexes; any
        other value selects the protein indexes
    """
    if doctype == DOCTYPE_METABOLITE:
        # Field is "taxonomy" in the entries; the former "taxanomy" spelling
        # built the text index on a field that does not exist
        text_index = IndexModel([
            ("description", "text"), ("name", "text"),
            ("taxonomy.description", "text")])
        lookup_fields = [
            "accession",
            "protein_associations.protein.protein_accession",
            "protein_associations.protein.gene_name",
        ]
    else:  # Proteins
        text_index = IndexModel([
            ("gene_name", "text"), ("general_function", "text"),
            ("specific_function", "text")])
        lookup_fields = [
            "accession",
            "metabolite_associations.metabolite.accession",
        ]
    mdb.create_indexes([text_index])
    for field in lookup_fields:
        mdb.create_index(field)
def main(infile, index, doctype, db, host=None, port=None):
    """Index an HMDB file into the selected database.

    :param infile: path of the HMDB metabolites/proteins file
    :param index: Elasticsearch index name or MongoDB database name
    :param doctype: document type; when None it is chosen from the file
        name (protein type if the name contains 'protein', else metabolite)
    :param db: 'Elasticsearch' selects Elasticsearch; any other value takes
        the MongoDB path
    :param host: server hostname, passed to Indexer
    :param port: server port, passed to Indexer
    """
    if doctype is None:
        doctype = (DOCTYPE_PROTEIN if 'protein' in infile
                   else DOCTYPE_METABOLITE)
    indexer = Indexer(db, index, host, port, doctype)
    if db != 'Elasticsearch':
        parse_hmdb_xmlfile(infile, indexer.mongodb_index_hmdb_entry)
        # Indexes are created after all entries have been loaded
        mongodb_indices(indexer.mcl, doctype)
        return
    parse_hmdb_xmlfile(infile, indexer.es_index_hmdb_entry)
    indexer.es.indices.refresh(index=index)
# Command line entry point: parse the options and run the indexer
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Index HMDB proteins/metabolites datasets,'
                    ' with Elasticsearch or MongoDB')
    parser.add_argument('-infile', '--infile',
                        required=True,
                        help='Input file name')
    parser.add_argument('--index',
                        default="hmdb",
                        help='Name of the Elasticsearch index or MongoDB db')
    parser.add_argument('--doctype',
                        help='Document type (hmdbprotein or hmdbmetabolite)')
    parser.add_argument('--host',
                        help='Elasticsearch or MongoDB server hostname')
    parser.add_argument('--port',
                        help="Elasticsearch or MongoDB server port number")
    # main() only handles these two values; rejecting anything else here
    # fails fast instead of after the whole input file has been read
    parser.add_argument('--db', default='Elasticsearch',
                        choices=['Elasticsearch', 'MongoDB'],
                        help="Database: 'Elasticsearch' or 'MongoDB'")
    args = parser.parse_args()
    main(args.infile, args.index, args.doctype, args.db, args.host, args.port)