Skip to content

Commit

Permalink
reviewed HGNC index script
Browse files Browse the repository at this point in the history
  • Loading branch information
uludag committed Mar 24, 2019
1 parent 07a868d commit 2a736e0
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 21 deletions.
29 changes: 18 additions & 11 deletions geneinfo/hgnc_geneinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def es_index_genes(dbc, genes):
dbc.es,
read_genes(genes),
index=dbc.index,
doc_type=DOCTYPE,
doc_type='_doc',
chunk_size=CHUNKSIZE
):
action, result = result.popitem()
Expand All @@ -77,16 +77,15 @@ def es_index_genes(dbc, genes):
return r


def mongodb_index_genes(mdbi, genes):
def mongodb_index_genes(mdbc, genes):
entries = list()
mdbi[DOCTYPE].delete_many({})
try:
for entry in read_genes(genes):
entries.append(entry)
if len(entries) == CHUNKSIZE:
mdbi[DOCTYPE].insert_many(entries)
mdbc.insert_many(entries)
entries = list()
mdbi[DOCTYPE].insert_many(entries)
mdbc.insert_many(entries)
except BulkWriteError as bwe:
pprint(bwe.details)
return
Expand Down Expand Up @@ -122,6 +121,9 @@ class GeneInfo(Base):
entrez_id = Column(Integer)
enzyme_id = Column(ARRAY(Text))
gene_family_id = Column(Text)
gene_group = Column(ARRAY(Text))
gene_group_id = Column(ARRAY(Integer))
gtrnadb = Column(Text)
hgnc_id = Column(Text)
imgt = Column(Text)
iuphar = Column(Text)
Expand All @@ -136,7 +138,7 @@ class GeneInfo(Base):
pubmed_id = Column(Text)
refseq_accession = Column(Text)
rgd_id = Column(Text)
rna_central_ids = Column(ARRAY(Text))
rna_central_id = Column(ARRAY(Text))
ucsc_id = Column(Text)
uniprot_ids = Column(ARRAY(Text))
vega_id = Column(Text)
Expand Down Expand Up @@ -169,15 +171,17 @@ def pgsql_index_genes(session, genes):
session.commit()


def main(db, infile, index, user=None, password=None, host=None, port=None):
def main(db, infile, index, doctype,
user=None, password=None, host=None, port=None):
if db in ["Elasticsearch", "MongoDB"]:
dbc = DBconnection(db, index, host=host, port=port, recreateindex=True)
dbc = DBconnection(db, index, collection=doctype, host=host, port=port,
recreateindex=True)
if dbc.db == "Elasticsearch":
read_and_index_hgnc_file(infile, dbc, es_index_genes)
dbc.es.indices.refresh(index=index)
elif dbc.db == "MongoDB":
dbc.mdbi.drop_collection(DOCTYPE)
read_and_index_hgnc_file(infile, dbc.mdbi, mongodb_index_genes)
read_and_index_hgnc_file(infile, dbc.mdbi[doctype],
mongodb_index_genes)
else:
session = pgsql_connect(host, port, user, password, index)
session.query(GeneInfo).delete()
Expand All @@ -194,6 +198,9 @@ def main(db, infile, index, user=None, password=None, host=None, port=None):
parser.add_argument('--index', default=INDEX,
help='Index name for Elasticsearch, '
'database name for MongoDB and PostgreSQL')
parser.add_argument('--doctype', default=DOCTYPE,
help='Collection name for MongoDB, '
'table name for PostgreSQL')
parser.add_argument('--host',
help='Hostname for the database server')
parser.add_argument('--port',
Expand All @@ -208,5 +215,5 @@ def main(db, infile, index, user=None, password=None, host=None, port=None):
help="Password for the database user, "
" supported with PostgreSQL option only")
args = parser.parse_args()
main(args.db, args.infile, args.index,
main(args.db, args.infile, args.index, args.doctype,
args.user, args.password, args.host, args.port)
17 changes: 7 additions & 10 deletions geneinfo/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
## HGNC gene info

[hgnc_geneinfo.py](hgnc_geneinfo.py); Index HGNC data using Elasticsearch,
MongoDB or PostgreSQL
MongoDB or PostgreSQL (naive)

Tested with June 2018 release
Tested with Mar 2019 release

```bash
# ~30M
wget -P ./data http://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_complete_set.json
wget -nc -P ./data http://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_complete_set.json

# Requires ~1m
./geneinfo/hgnc_geneinfo.py --infile ./data/hgnc_complete_set.json --db Elasticsearch
Expand All @@ -19,17 +19,14 @@ wget -P ./data http://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_comple

# Requires ~1m
# Assume PostgreSQL database with name geneinfo has already been created
# and the user `tests` have access to the database with password 'tests'
# and the user `geneinfo` has access to the database with password 'geneinfo'
# Use --host and --port options if the database host is different than localhost
# or if its port number is different than 5432
./geneinfo/hgnc_geneinfo.py --infile ./data/hgnc_complete_set.json\
--db PostgreSQL --index geneinfo --user tests --password tests
--db PostgreSQL --index geneinfo --user geneinfo --password geneinfo
```
PostgreSQL support is based on [SQLAlchemy](http://www.sqlalchemy.org) library

Note: A command line option to change MongoDB collection name and PostgreSQL table name
has not been implemented yet, it is defined with constant `DOCTYPE = 'hgncgeneinfo'`
in hgnc_geneinfo.py
PostgreSQL support is based on the [SQLAlchemy](http://www.sqlalchemy.org) library;
the table name is defined by the constant `DOCTYPE` ('hgncgeneinfo')

### Similar work

Expand Down

0 comments on commit 2a736e0

Please sign in to comment.