-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparser.py
73 lines (60 loc) · 2.3 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import glob
import json
import logging
import os
import urllib.request
from string import ascii_uppercase as alc

import pandas as pd
def _download_ddinter_csvs(data_folder):
    """Best-effort download of the per-letter DDInter CSV exports into *data_folder*."""
    logger = logging.getLogger(__name__)
    for letter in alc:
        filename = f"ddinter_downloads_code_{letter}.csv"
        url = f"http://ddinter.scbdd.com/static/media/download/{filename}"
        try:
            urllib.request.urlretrieve(url, os.path.join(data_folder, filename))
        except Exception as exc:
            # Deliberately best-effort: a failed download for one letter must
            # not abort the run, but it is reported instead of being
            # silently swallowed.
            logger.warning("Could not download %s: %s", url, exc)


def _drug_entry(ddinter_id, name, drug_data, characteristics):
    """Build the sub-document for one drug, adding its non-empty cross-references."""
    entry = {'ddinterid': ddinter_id, 'name': name}
    # Records are looked up positionally: "DDInter<N>" -> drug_data[N - 1].
    # NOTE(review): this assumes the records in drug_data.json are ordered by
    # DDInter number with no gaps, and that N >= 1 (N == 0 would wrap around
    # to the last record) - confirm against the scraper output.
    record = drug_data[int(ddinter_id.split('DDInter')[-1]) - 1]
    for characteristic in characteristics:
        info = record[characteristic]
        if info:
            entry[characteristic] = info
    return entry


def load_data(data_folder, *, download=True):
    """Yield one document per unique DDInter drug-drug interaction.

    The function (optionally) downloads the ``ddinter_downloads_code_<A-Z>.csv``
    files into *data_folder*, merges every matching CSV found there, and
    enriches both drugs of each row with the ``chembl``, ``pubchem`` and
    ``drugbank`` values read from ``drug_data.json`` in the same folder.

    Rows whose ``<DDInterID_A>_<DDInterID_B>_<Level>`` key was already emitted
    are skipped, so each ``_id`` is yielded at most once.

    Args:
        data_folder: Directory that receives the downloaded CSV files and
            that contains ``drug_data.json`` (a JSON object whose
            ``"records"`` entry is indexed by DDInter number minus one).
        download: When true (the default, matching the historical
            behaviour) the CSV files are fetched first. Pass ``False`` to
            parse only the files already present in *data_folder*.

    Yields:
        dict: ``{'_id', 'drug_a', 'drug_b', 'level'}`` where ``drug_a`` and
        ``drug_b`` hold ``ddinterid``, ``name`` and any non-empty
        ``chembl`` / ``pubchem`` / ``drugbank`` values.

    Raises:
        ValueError: From ``pandas.concat`` when no CSV file is found.
        OSError: When ``drug_data.json`` cannot be opened.

    Because this is a generator, nothing (including the download) happens
    until the first item is requested.
    """
    if download:
        _download_ddinter_csvs(data_folder)

    # Merge DDInter records from all CSV files into one frame. The folder
    # name is escaped so glob metacharacters in it are taken literally, and
    # the paths are sorted so the output order is deterministic.
    pattern = os.path.join(
        glob.escape(os.fspath(data_folder)), "ddinter_downloads_code_*.csv")
    csv_paths = sorted(glob.glob(pattern))
    merged_csv = pd.concat(
        [pd.read_csv(path) for path in csv_paths], ignore_index=True)

    # Load the file with scraped per-drug data; the context manager makes
    # sure the handle is closed.
    drug_info_file = os.path.join(data_folder, 'drug_data.json')
    with open(drug_info_file, encoding='utf-8') as drug_info:
        drug_data = json.load(drug_info)['records']

    drug_characteristics = ('chembl', 'pubchem', 'drugbank')
    seen_ids = set()  # O(1) membership test for de-duplication
    columns = (
        merged_csv[column]
        for column in ('DDInterID_A', 'Drug_A', 'DDInterID_B', 'Drug_B', 'Level')
    )
    for ddinter_id_a, drug_a, ddinter_id_b, drug_b, level in zip(*columns):
        doc_id = '_'.join((ddinter_id_a, ddinter_id_b, level))
        if doc_id in seen_ids:
            continue
        seen_ids.add(doc_id)
        yield {
            '_id': doc_id,
            'drug_a': _drug_entry(
                ddinter_id_a, drug_a, drug_data, drug_characteristics),
            'drug_b': _drug_entry(
                ddinter_id_b, drug_b, drug_data, drug_characteristics),
            'level': level,
        }