Skip to content

Commit

Permalink
Merge pull request #361 from bmeg/rc5-patch
Browse files Browse the repository at this point in the history
Fix G2P -> Publication links from DGIdb
  • Loading branch information
adamstruck authored Dec 23, 2019
2 parents e57def7 + a80511b commit 3ec2706
Show file tree
Hide file tree
Showing 10 changed files with 76 additions and 53 deletions.
32 changes: 16 additions & 16 deletions outputs.bmeg_manifest.dvc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
md5: 8799729d258acd405bdf6d92de68dfcf
md5: fa537571e66fb976ab6f547028ddf408
cmd: echo generating file manifest...
deps:
- md5: b2d2ab73b01758f77505caecc3963f29
Expand Down Expand Up @@ -69,23 +69,23 @@ deps:
path: outputs/ccle/maf.SomaticCallset_Aliquots_Aliquot.Edge.json.gz
- md5: 4dfc12d1a89eee58e38ef9109bbd64e4
path: outputs/celllines/Case_SameAs_Case.Edge.json.gz
- md5: 8e8e11059b5cbd3c1ae85573b30dabb0
- md5: 015143f9421a9fb5a3311f92043056c2
path: outputs/compound/normalized.Case_Compounds_Compound.Edge.json.gz
- md5: dd0ac5a944094d76a0cf12c4bff575da
- md5: 20b8a955bff703c00690081bebfbd372
path: outputs/compound/normalized.Compound.Vertex.json.gz
- md5: 488eee20c9e4f0c99070eb6dfbb3fdb7
- md5: acb155e76510e2661158e52272338a3c
path: outputs/compound/normalized.Compound_Cases_Case.Edge.json.gz
- md5: 967a3e45c1ea2270a9b9bb4763b9cff9
- md5: a0af3202a4dcef822bd16127407c043e
path: outputs/compound/normalized.Compound_DrugResponses_DrugResponse.Edge.json.gz
- md5: 21734e88b95e98c4eabac40c91cfbd45
- md5: 6a029d0ee0b8f531cce4410d0e8affa4
path: outputs/compound/normalized.Compound_G2PAssociations_G2PAssociation.Edge.json.gz
- md5: a9945dd00371f9ae553098b2b71f4ce8
- md5: d8cfbd16c89e661cf15df5c37cea5d72
path: outputs/compound/normalized.Compound_Projects_Project.Edge.json.gz
- md5: fb5f300161494a94d2493cfb5b704e68
- md5: 645c674a623e27bc84f9449ad8111683
path: outputs/compound/normalized.DrugResponse_Compounds_Compound.Edge.json.gz
- md5: c76490317a7a49562e903c52ddba2c6e
- md5: 69b9e6f4ee8226dac032bf8814d72ff2
path: outputs/compound/normalized.G2PAssociation_Compounds_Compound.Edge.json.gz
- md5: c53f52b19e36799ee099edc0ef3d7657
- md5: a56821071854cc805120007a99728f0e
path: outputs/compound/normalized.Project_Compounds_Compound.Edge.json.gz
- md5: cb3fcbfcee28d6434f8397355cff568a
path: outputs/ctrp/ctrp.Aliquot.Vertex.json.gz
Expand Down Expand Up @@ -113,15 +113,15 @@ deps:
path: outputs/ctrp/ctrp.Sample_Aliquots_Aliquot.Edge.json.gz
- md5: ce514bf610e31c6b36b3d2945d2e9ccc
path: outputs/ctrp/ctrp.Sample_Case_Case.Edge.json.gz
- md5: 3fa57b7c369e2c8dd2dc0d0b6c44766e
- md5: aac51ae53ff021ed9291b1aa2ae78076
path: outputs/dgidb/G2PAssociation.Vertex.json.gz
- md5: 166177ce31138f0ee32dc339fdfdd987
- md5: 758362f57be6a69c6969a48182e0ff18
path: outputs/dgidb/G2PAssociation_Genes_Gene.Edge.json.gz
- md5: 4f99308481017576d6cc208b6f514eab
- md5: 57d481f7135e3a8761ad8920ceca5fb0
path: outputs/dgidb/G2PAssociation_Publications_Publication.Edge.json.gz
- md5: 97282e6b782ab81889884ba6d9ba368e
- md5: a701cee66767a702c18b7b3d4cb4b829
path: outputs/dgidb/Gene_G2PAssociations_G2PAssociation.Edge.json.gz
- md5: 03f13f3513d5282b8a1f309429ae81f0
- md5: cecbf1a9f863ecfd0ce3db553becfad9
path: outputs/dgidb/Publication_G2PAssociations_G2PAssociation.Edge.json.gz
- md5: 3f89bd640983b7ea76b5dbbbcd846941
path: outputs/ensembl/Exon.Vertex.json.gz
Expand Down Expand Up @@ -359,7 +359,7 @@ deps:
path: outputs/phenotype/normalized.Phenotype_Samples_Sample.Edge.json.gz
- md5: 3729d0155dbf3f3f30b926a030a1cf95
path: outputs/phenotype/normalized.Sample_Phenotypes_Phenotype.Edge.json.gz
- md5: 87182b1ccb82bc03a2efb61ca77703d5
- md5: 7bbe3da8a9ade7327c41858d20107e58
path: outputs/publication/stub.Publication.Vertex.json.gz
- md5: 6bdd6ef5f03bc16023a4c59bfcec95db
path: outputs/pubmed/baseline/pubmed19n0001.Publication.Vertex.json.gz
Expand Down
26 changes: 13 additions & 13 deletions outputs/compound/normalized.compounds.dvc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
md5: 72113976b51fe192773b38a2d2071503
md5: a8590f84b821452a9ac88e92ab816e88
cmd: python3 transform/compound/transform.py
wdir: ../..
deps:
Expand All @@ -14,7 +14,7 @@ deps:
path: outputs/gdc/gdc.Compound.Vertex.json.gz
- md5: bdd1c3dedf67c2f3179f04771115a3c7
path: outputs/pharmacodb/Compound.Vertex.json.gz
- md5: f7a2bbfb7436d76e09b9e13bd3d0c9ff
- md5: 502320cf8546d8607d62a1f0518e97cc
path: outputs/dgidb/Compound.Vertex.json.gz
- md5: 90070ff6332de4a429188a69ea8d9633
path: outputs/pharmacodb/DrugResponse_Compounds_Compound.Edge.json.gz
Expand All @@ -36,52 +36,52 @@ deps:
path: outputs/pharmacodb/Project_Compounds_Compound.Edge.json.gz
- md5: 62990f11a0051b7357506699167a7611
path: outputs/pharmacodb/Compound_Projects_Project.Edge.json.gz
- md5: 9c4627785660215fa9adbf7c9f6cf172
- md5: 708a48991dc4d6051097e6a8e8175b7b
path: outputs/dgidb/G2PAssociation_Compounds_Compound.Edge.json.gz
- md5: fa0ae005deb28af2d935cc818c1394b4
- md5: 0e9b6635d4e8153d221d1c2a0604627a
path: outputs/dgidb/Compound_G2PAssociations_G2PAssociation.Edge.json.gz
outs:
- md5: dd0ac5a944094d76a0cf12c4bff575da
- md5: 20b8a955bff703c00690081bebfbd372
path: outputs/compound/normalized.Compound.Vertex.json.gz
cache: true
metric: false
persist: false
- md5: fb5f300161494a94d2493cfb5b704e68
- md5: 645c674a623e27bc84f9449ad8111683
path: outputs/compound/normalized.DrugResponse_Compounds_Compound.Edge.json.gz
cache: true
metric: false
persist: false
- md5: 967a3e45c1ea2270a9b9bb4763b9cff9
- md5: a0af3202a4dcef822bd16127407c043e
path: outputs/compound/normalized.Compound_DrugResponses_DrugResponse.Edge.json.gz
cache: true
metric: false
persist: false
- md5: c53f52b19e36799ee099edc0ef3d7657
- md5: a56821071854cc805120007a99728f0e
path: outputs/compound/normalized.Project_Compounds_Compound.Edge.json.gz
cache: true
metric: false
persist: false
- md5: a9945dd00371f9ae553098b2b71f4ce8
- md5: d8cfbd16c89e661cf15df5c37cea5d72
path: outputs/compound/normalized.Compound_Projects_Project.Edge.json.gz
cache: true
metric: false
persist: false
- md5: 488eee20c9e4f0c99070eb6dfbb3fdb7
- md5: acb155e76510e2661158e52272338a3c
path: outputs/compound/normalized.Compound_Cases_Case.Edge.json.gz
cache: true
metric: false
persist: false
- md5: 8e8e11059b5cbd3c1ae85573b30dabb0
- md5: 015143f9421a9fb5a3311f92043056c2
path: outputs/compound/normalized.Case_Compounds_Compound.Edge.json.gz
cache: true
metric: false
persist: false
- md5: c76490317a7a49562e903c52ddba2c6e
- md5: 69b9e6f4ee8226dac032bf8814d72ff2
path: outputs/compound/normalized.G2PAssociation_Compounds_Compound.Edge.json.gz
cache: true
metric: false
persist: false
- md5: 21734e88b95e98c4eabac40c91cfbd45
- md5: 6a029d0ee0b8f531cce4410d0e8affa4
path: outputs/compound/normalized.Compound_G2PAssociations_G2PAssociation.Edge.json.gz
cache: true
metric: false
Expand Down
20 changes: 10 additions & 10 deletions outputs/dgidb/dgidb.dvc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
md5: e720598cc16fd7bb5918bd09f0397589
md5: 34c1bbfdb0868d369c4283b8982f2137
cmd: python3 transform/dgidb/transform.py
wdir: ../..
deps:
Expand All @@ -8,45 +8,45 @@ deps:
path: source/drug_enricher/drug_alias.tsv
- md5: 64e7a82c87e7151a7c49846469157547
path: src/bmeg/enrichers/drug_enricher.py
- md5: 936873de1ea7a748293263ff126a2967
- md5: 1f1f691fa223e6c7c05cb9324a68d96a
path: transform/dgidb/transform.py
outs:
- md5: 3fa57b7c369e2c8dd2dc0d0b6c44766e
- md5: aac51ae53ff021ed9291b1aa2ae78076
path: outputs/dgidb/G2PAssociation.Vertex.json.gz
cache: true
metric: false
persist: false
- md5: f7a2bbfb7436d76e09b9e13bd3d0c9ff
- md5: 502320cf8546d8607d62a1f0518e97cc
path: outputs/dgidb/Compound.Vertex.json.gz
cache: true
metric: false
persist: false
- md5: 166177ce31138f0ee32dc339fdfdd987
- md5: 758362f57be6a69c6969a48182e0ff18
path: outputs/dgidb/G2PAssociation_Genes_Gene.Edge.json.gz
cache: true
metric: false
persist: false
- md5: 4f99308481017576d6cc208b6f514eab
- md5: 57d481f7135e3a8761ad8920ceca5fb0
path: outputs/dgidb/G2PAssociation_Publications_Publication.Edge.json.gz
cache: true
metric: false
persist: false
- md5: 9c4627785660215fa9adbf7c9f6cf172
- md5: 708a48991dc4d6051097e6a8e8175b7b
path: outputs/dgidb/G2PAssociation_Compounds_Compound.Edge.json.gz
cache: true
metric: false
persist: false
- md5: 03f13f3513d5282b8a1f309429ae81f0
- md5: cecbf1a9f863ecfd0ce3db553becfad9
path: outputs/dgidb/Publication_G2PAssociations_G2PAssociation.Edge.json.gz
cache: true
metric: false
persist: false
- md5: 97282e6b782ab81889884ba6d9ba368e
- md5: a701cee66767a702c18b7b3d4cb4b829
path: outputs/dgidb/Gene_G2PAssociations_G2PAssociation.Edge.json.gz
cache: true
metric: false
persist: false
- md5: fa0ae005deb28af2d935cc818c1394b4
- md5: 0e9b6635d4e8153d221d1c2a0604627a
path: outputs/dgidb/Compound_G2PAssociations_G2PAssociation.Edge.json.gz
cache: true
metric: false
Expand Down
10 changes: 5 additions & 5 deletions outputs/publication/stub_publications.dvc
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
md5: 9c3bbc179474e91c7ae2da409b34ca88
md5: b76b1becf88a1b639c81da86933d69dc
cmd: python3 transform/publication/transform.py
wdir: ../..
deps:
- md5: 15fc6631cd700953dc7bcc74728c44fa
- md5: ba4e56706b22e7572ef4591a999a5a58
path: transform/publication/transform.py
- md5: 1bef5c26bd8aab5b43de9c6e9f8d87de
path: outputs/g2p/G2PAssociation_Publications_Publication.Edge.json.gz
- md5: f2b1f26f98bc1f94fa528cbafbb6c60a
path: outputs/g2p/Publication_G2PAssociations_G2PAssociation.Edge.json.gz
- md5: 4f99308481017576d6cc208b6f514eab
- md5: 57d481f7135e3a8761ad8920ceca5fb0
path: outputs/dgidb/G2PAssociation_Publications_Publication.Edge.json.gz
- md5: 03f13f3513d5282b8a1f309429ae81f0
- md5: cecbf1a9f863ecfd0ce3db553becfad9
path: outputs/dgidb/Publication_G2PAssociations_G2PAssociation.Edge.json.gz
- md5: 2a69a3d9fe9acd1c5465afc03be20cd2.dir
path: outputs/pubmed/baseline
Expand All @@ -23,7 +23,7 @@ deps:
- md5: 737bc587e7faef80a008f30b9f310643
path: outputs/pathway_commons/Interaction_Publications_Publication.Edge.json.gz
outs:
- md5: 87182b1ccb82bc03a2efb61ca77703d5
- md5: 7bbe3da8a9ade7327c41858d20107e58
path: outputs/publication/stub.Publication.Vertex.json.gz
cache: true
metric: false
Expand Down
2 changes: 1 addition & 1 deletion src/bmeg/bmeg-dictionary
1 change: 1 addition & 0 deletions tests/unit/dgidb/source/dgidb/interactions.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ PDGFRB PDGFR 5159 ChemblInteractions inhibitor CHEMBL576982 QUIZARTINIB QUIZARTI
MTOR 2109 2475 GuideToPharmacologyInteractions inhibitor 9361 WYE-354 CHEMBL561708 CHEMBL561708
DNTT DNTT 1791 NCI HYDROXYUREA HYDROXYUREA HYDROXYUREA CHEMBL467 291471
OPRM1 319 4988 GuideToPharmacologyInteractions agonist 3534 SUFENTANIL SUFENTANIL CHEMBL658
PDGFRA PDGFRA 5156 DoCM IMATINIB IMATINIB IMATINIB CHEMBL941 15928335,15685537,22718859,16638875,15146165,12949711,26130666,25157968,24132921,14645423,16954519,18794084,22745105
11 changes: 11 additions & 0 deletions tests/unit/dgidb/test_dgidb_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
import contextlib
import pytest
import shutil
import json

from transform.dgidb.transform import transform
from bmeg.ioutils import reader


@pytest.fixture
Expand Down Expand Up @@ -50,6 +53,14 @@ def validate(helpers, interactions_file, emitter_directory):
exclude_labels=['Publication', 'Gene']
)

count = 0
with reader(pub_edge_file) as f:
for line in f:
e = json.loads(line)
assert e['to'] != 'Publication:ncbi.nlm.nih.gov/pubmed/'
count += 1
assert count == 16


def test_simple(helpers, interactions_file, emitter_directory):
""" simple test """
Expand Down
2 changes: 1 addition & 1 deletion transform/dgidb/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def transform(interactions_file="source/dgidb/interactions.tsv",
),
emit_backref=True
)
if line["PMIDs"] is None or line["PMIDs"] != "":
if line["PMIDs"] is None or line["PMIDs"] == "":
continue
pubs = line["PMIDs"].split(",")
for p in pubs:
Expand Down
1 change: 1 addition & 0 deletions transform/ensembl/uniprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def transform(protein_table_path='source/ensembl/Homo_sapiens.GRCh37.85.uniprot.

if uniprot_id not in emitted:
p = Uniprot(id=Uniprot.make_gid(uniprot_id),
uniprot_id=uniprot_id,
genome=GENOME_BUILD,
project_id=PROJECT_ID)
emitter.emit_vertex(p)
Expand Down
24 changes: 17 additions & 7 deletions transform/publication/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,33 +50,43 @@ def transform(
e = f = r = 0
for file in files:
f += 1
logging.info("processing HasSupportingReference file: {}/{}".format(f, nfiles))
logging.info("processing file: {}/{}".format(f, nfiles))
with reader(file) as ins:
for line in ins:
try:
edge = ujson.loads(line)
if 'Publication:' not in edge['gid']:
pid = None
# get edge components
if edge['to'].startswith('Publication'):
pid = edge['to']
elif edge['from'].startswith('Publication'):
pid = edge['from']
else:
logging.info('Edge {} has no publications that need transformation. skipping.'.format(file))
break
# get edge components
to = edge['to']
if to in dedup:

if pid in dedup:
r += 1
continue
dedup[to] = True
url = to.replace('Publication:', 'http://')

url = pid.replace('Publication:', 'http://')
publication = Publication(
id=Publication.make_gid(url),
url=url,
project_id=Project.make_gid("Reference")
)
emitter.emit_vertex(publication)

dedup[pid] = True
e += 1

except Exception as exc:
logging.error(str(exc))
raise exc

if e % batch_size == 0:
logging.info('emitted stub publication vertices: {}'.format(e))

logging.info('emitted stub publication vertices: {}'.format(e))
logging.info('existing publication refs found: {}'.format(r))
emitter.close()
Expand Down

0 comments on commit 3ec2706

Please sign in to comment.