Skip to content

Commit

Permalink
Fix bug in make_plasmid_summary that was messing up references (reference links were looked up by Addgene ID instead of by reference ID)
Browse files (browse the repository at this point in the history)
  • Loading branch information
manulera committed Nov 6, 2024
1 parent e7a58fe commit 36b04c5
Show file tree
Hide file tree
Showing 10 changed files with 94,341 additions and 27,304 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ config:
layout: elk
---
flowchart LR
SnapGene ==> Plasmids[17k+ plasmids]
SnapGene ==> Plasmids[~14k plasmids]
AddGene ==> Plasmids
Plasmids ==> Sites[extracted att sites]
Plasmids ==> SequenceFeatures[extracted sequence features]
Expand Down
10 changes: 7 additions & 3 deletions make_feature_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@
import json
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature
import warnings
from tqdm import tqdm


def main(plasmid_summary_file, plasmid_site_dict_file, output_file):
Expand All @@ -60,7 +62,7 @@ def main(plasmid_summary_file, plasmid_site_dict_file, output_file):
with open(plasmid_site_dict_file) as f:
plasmid_site_dict = json.load(f)

for plasmid in plasmid_summary:
for plasmid in tqdm(plasmid_summary, desc="Extracting plasmid features"):
# If no att sites, skip and remove from plasmid summary
if len(plasmid_site_dict[plasmid["file"]]) == 0:
plasmid_summary.remove(plasmid)
Expand All @@ -70,8 +72,10 @@ def main(plasmid_summary_file, plasmid_site_dict_file, output_file):
with open(plasmid["file"], "br") as f:
record = SeqIO.read(f, "snapgene")
elif plasmid["source"] == "addgene":
with open(plasmid["file"], "r") as f:
record = SeqIO.read(f, "genbank")
with warnings.catch_warnings():
warnings.simplefilter("ignore")
with open(plasmid["file"], "r") as f:
record = SeqIO.read(f, "genbank")
features: list[SeqFeature] = record.features
plasmid["att_sites"] = list(sorted(plasmid_site_dict[plasmid["file"]].keys()))
plasmid["features"] = set()
Expand Down
3 changes: 0 additions & 3 deletions make_plasmid_files.sh

This file was deleted.

8 changes: 6 additions & 2 deletions make_plasmid_site_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import json
import re
import glob
import warnings
from tqdm import tqdm


def main(input_folder, output_file):
Expand All @@ -29,11 +31,13 @@ def main(input_folder, output_file):

out_dict = dict()

for file in files:
for file in tqdm(files, desc="Processing plasmid files"):
plasmid_dict = dict()

file_format = "snapgene" if file.split(".")[-1] == "dna" else "genbank"
plasmid_record = SeqIO.read(file, file_format)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
plasmid_record = SeqIO.read(file, file_format)

# Find att sites
for feature in plasmid_record.features:
Expand Down
17 changes: 13 additions & 4 deletions make_plasmid_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,23 @@ def main(
input_folder, output_file, all_gateway_plasmids, addgene_kits, addgene_articles
):
addgene_id2name = dict()
addgene_id2reference_id = dict()
with open(all_gateway_plasmids) as f:
for line in f:
addgene_id, name = line.strip().split("\t")[:2]
addgene_id2name[addgene_id] = name
reference_id = line.strip().split("\t")[2:]
addgene_id2reference_id[addgene_id] = (
reference_id[0] if len(reference_id) > 0 else None
)

addgene_id2references = dict()
reference_id2reference_links = dict()
with open(addgene_articles) as f:
for line in f:
ls = line.strip().split("\t")
addgene_id = ls[0]
references = ls[1:]
addgene_id2references[addgene_id] = references
reference_id2reference_links[addgene_id] = references

addgene_id2kit = dict()
with open(addgene_kits) as f:
Expand Down Expand Up @@ -75,8 +80,12 @@ def main(
plasmid_dict["plasmid_name"] = addgene_id2name[addgene_id]
plasmid_dict["sequence-type"] = basename.split(".")[1]
plasmid_dict["addgene_id"] = addgene_id
if addgene_id in addgene_id2references:
plasmid_dict["references"] = addgene_id2references[addgene_id]
if addgene_id in addgene_id2reference_id:
reference_id = addgene_id2reference_id[addgene_id]
if reference_id is not None:
plasmid_dict["references"] = reference_id2reference_links[
reference_id
]
else:
plasmid_dict["references"] = []
if addgene_id in addgene_id2kit:
Expand Down
34 changes: 33 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ python = "^3.12"
biopython = "^1.84"
pydna = "^5.2.0"
pandas = "^2.2.3"
tqdm = "^4.67.0"


[tool.poetry.group.playwright.dependencies]
Expand Down
Loading

0 comments on commit 36b04c5

Please sign in to comment.