forked from manulera/GateWayMine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_feature_dict.py
120 lines (105 loc) · 3.45 KB
/
make_feature_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Make a dictionary of features for each plasmid, including:
- plasmid_name
- att_sites
- features (name extracted from gene or label)
- source (snapgene or addgene)
- file (path to the plasmid file in this repository)
- if it's addgene:
- addgene_id
- references, if available as links
- kit (name and url), if it belongs to a kit
The resulting JSON file is then used in the GatewayMine web app.
E.g.
{
"source": "snapgene",
"plasmid_name": "pDEST15",
"att_sites": [
"attR1",
"attR2"
],
"features": [
"AmpR",
"AmpR promoter",
...
]
},
{
"source": "addgene",
"plasmid_name": "pDONR223_C1orf150_p.G2E",
"sequence-type": "addgene-full",
"addgene_id": "81309",
"references": [],
"kit": {
"name": "Broad Target Accelerator Plasmid Collections",
"url": "https://www.addgene.org/1000000103/"
},
"att_sites": [
"attL1",
"attL2"
],
"features": [
"L4440",
"M13 Forward",
...
]
},
"""
import json
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature
import warnings
from tqdm import tqdm
def _load_record(source, path):
    """Parse the plasmid file at ``path`` with the parser matching ``source``.

    ``"snapgene"`` files are opened in binary mode and read with the
    ``snapgene`` parser; ``"addgene"`` files are opened as text and read with
    the ``genbank`` parser.

    Raises:
        ValueError: if ``source`` is neither ``"snapgene"`` nor ``"addgene"``.
    """
    if source == "snapgene":
        with open(path, "br") as f:
            return SeqIO.read(f, "snapgene")
    if source == "addgene":
        # All warnings raised while parsing these files are suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            with open(path, "r") as f:
                return SeqIO.read(f, "genbank")
    # Failing loudly avoids silently reusing the record of a previous plasmid.
    raise ValueError(f"Unknown plasmid source {source!r} for file {path!r}")


def _feature_names(features):
    """Return the sorted, de-duplicated names of ``features``.

    A feature's name is its first ``label`` qualifier when present, otherwise
    its first ``gene`` qualifier; features with neither are ignored.
    """
    names = set()
    for feature in features:
        for key in ("label", "gene"):
            if key in feature.qualifiers:
                names.add(feature.qualifiers[key][0])
                break
    return sorted(names)


def main(plasmid_summary_file, plasmid_site_dict_file, output_file):
    """Write a JSON list describing the att sites and features of each plasmid.

    Args:
        plasmid_summary_file: path to a JSON list of plasmid entries; each
            entry is read for its ``"file"`` and ``"source"`` keys.
        plasmid_site_dict_file: path to a JSON object keyed by plasmid file
            path, whose values map att site names to site data.
        output_file: path the resulting JSON list is written to.

    Plasmids without any att site are left out of the output. Every kept
    entry loses its ``"file"`` key and gains ``"att_sites"`` and
    ``"features"`` (both sorted lists of names).

    Raises:
        KeyError: if a plasmid file has no entry in the site dictionary.
        ValueError: if a plasmid has an unsupported ``"source"``.
    """
    with open(plasmid_summary_file) as f:
        plasmid_summary = json.load(f)
    with open(plasmid_site_dict_file) as f:
        plasmid_site_dict = json.load(f)

    # Collect the kept entries in a new list: removing items from
    # plasmid_summary while iterating over it would skip the entry that
    # follows each removed one.
    kept = []
    for plasmid in tqdm(plasmid_summary, desc="Extracting plasmid features"):
        sites = plasmid_site_dict[plasmid["file"]]
        # If no att sites, leave the plasmid out of the output
        if not sites:
            continue

        # The file path is only needed for parsing, not in the output
        path = plasmid.pop("file")
        record = _load_record(plasmid["source"], path)
        features: list[SeqFeature] = record.features

        plasmid["att_sites"] = sorted(sites)
        plasmid["features"] = _feature_names(features)
        kept.append(plasmid)

    with open(output_file, "w") as f:
        json.dump(kept, f, indent=4)
if __name__ == "__main__":
    # Command-line entry point: each option is a path with a default under
    # results/, and the module docstring doubles as the help description.
    import argparse

    cli = argparse.ArgumentParser(description=__doc__)

    # (flag, help text, default path)
    path_options = (
        (
            "--plasmid-summary",
            "Path to the plasmid summary file",
            "results/plasmid_summary.json",
        ),
        (
            "--plasmid-site-dict",
            "Path to the plasmid site dictionary",
            "results/plasmid_site_dict.json",
        ),
        (
            "--output-file",
            "Path to the output file",
            "results/plasmid_features.json",
        ),
    )
    for flag, help_text, default_path in path_options:
        cli.add_argument(flag, help=help_text, default=default_path)

    options = cli.parse_args()
    main(
        options.plasmid_summary,
        options.plasmid_site_dict,
        options.output_file,
    )